码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫实例(三):模拟登陆新浪

时间:2016-09-03 21:02:25      阅读:680      评论:0      收藏:0      [点我收藏+]

标签:

1.在模拟登陆的过程中,第一步需要得到登陆前的信息:用户名和密码会由js预先加密,所以必须先拿到加密所需的servertime、nonce和pubkey。下面用json模块和re模块得到这些预登陆信息

技术分享
 1 #---coding:utf-8---
 2 import urllib2
 3 import re
 4 import json
 5 def get_servertime():
 6     url="http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939"
 7     
 8     data=urllib2.urlopen(url).read()
 9     
10     p=re.compile(\((.*)\))
11     
12     try:
13         json_data=p.search(data).group(1)
14         print json_data
15         data=json.loads(json_data)
16         ##采用json。loads进行解码,得到dict类型数据,从之中得到需要的数据
17         servertime = str(data[servertime])
18         nonce = data[nonce]
19         print servertime,"\n",nonce
20     except:
21        print "Get servertime data"
22        
# Run the pre-login request only when executed as a script.
if __name__ == '__main__':
    get_servertime()
View Code

结果上显示:

技术分享
 1 {"retcode":0,"servertime":1472783606,"pcid":"gz-32dce7bbd55e33948992c2978d847ff601de","nonce":"26ISKM","pubkey":"-----BEGIN PUBLIC KEY-----\nMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDrKjhWhmGIf6GAvdtcq9XyHHv9\nWcCQyy0kWoesJTBiiCcpKT5VBjUFCOf5qju3f0MzIxSQ+RX21jxV\/i8IpJs1P0RK\n05k8rMAtt4Sru45CqbG7\/\/s4vhjXjoeg5Bubj3OpKO4MzuH2c5iEuXd+T+noihu+\nSVknrEp5mzGB1kQkQwIDAQAB\n-----END PUBLIC KEY-----","rsakv":"1330428213","is_openlock":0,"exectime":10}
 2 1472783606 
 3 26ISKM 
 4 -----BEGIN PUBLIC KEY-----
 5 MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDrKjhWhmGIf6GAvdtcq9XyHHv9
 6 WcCQyy0kWoesJTBiiCcpKT5VBjUFCOf5qju3f0MzIxSQ+RX21jxV/i8IpJs1P0RK
 7 05k8rMAtt4Sru45CqbG7//s4vhjXjoeg5Bubj3OpKO4MzuH2c5iEuXd+T+noihu+
 8 SVknrEp5mzGB1kQkQwIDAQAB
 9 -----END PUBLIC KEY----- 
10 1330428213
View Code
在这里pwd密码采用SHA1进行三次哈希(并拼接servertime和nonce),用户名先做URL编码,再采用base64编码
 1 #---coding:utf-8---
 2 import urllib2
 3 import re
 4 import json
 5 import hashlib
 6 import urllib
 7 def get_servertime():
 8     url="http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939"
 9     
10     data=urllib2.urlopen(url).read()
11     
12     p=re.compile(\((.*)\))
13     
14     try:
15         json_data=p.search(data).group(1)
16         
17         data=json.loads(json_data)
18         ##采用json。loads进行解码,得到dict类型数据,从之中得到需要的数据
19         servertime = str(data[servertime])
20         nonce = data[nonce]
21         pubkey=data[pubkey]
22         rsakv=data[rsakv]
23         print servertime,"\n",nonce,"\n",pubkey,"\n",rsakv
24        # return servertime,nonce,rsakv
25     except:
26        print "Get servertime data"
27        return None
# Hash the password with three chained SHA-1 rounds (this is hashing, not RSA).
def get_pwd(pwd, servertime, nonce):
    """Return the salted triple-SHA-1 hex digest of ``pwd``.

    Computes ``sha1(sha1(sha1(pwd)) + servertime + nonce)``, where every
    intermediate digest is the lowercase hex string.

    Args:
        pwd: Plain-text password, as text or bytes.
        servertime: Server time from the pre-login response.
        nonce: Nonce from the pre-login response (text).

    Returns:
        The 40-character hexadecimal digest.
    """
    def sha1_hex(value):
        # hashlib only accepts bytes; text is encoded as UTF-8 first so the
        # function works on both Python 2 and Python 3.
        if not isinstance(value, bytes):
            value = value.encode("utf-8")
        return hashlib.sha1(value).hexdigest()

    pwd1 = sha1_hex(pwd)
    pwd2 = sha1_hex(pwd1)
    return sha1_hex(pwd2 + str(servertime) + nonce)
35 
def get_user(username):
    """Return the login name URL-quoted and then Base64-encoded.

    Args:
        username: Account name (for example an e-mail address).

    Returns:
        The Base64 text of the percent-quoted user name, without newlines.
    """
    # Imported locally because this listing's import block lacks base64.
    import base64
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    username_ = quote(username)
    if not isinstance(username_, bytes):
        username_ = username_.encode("utf-8")
    # b64encode never inserts line breaks, unlike encodestring, which wraps
    # long input and would corrupt the value.
    encoded = base64.b64encode(username_)
    return encoded if isinstance(encoded, str) else encoded.decode("ascii")
# Run the pre-login request only when executed as a script.
if __name__ == '__main__':
    get_servertime()

 

 完整的代码:模拟登陆中最重要的是弄清其加密机制,之后就是模拟登陆的一般步骤——构造post_data并对请求头进行包装:

在这里使用的是urllib、cookie库

  1 import urllib.request
  2 import http.cookiejar
  3 import base64
  4 import json
  5 import urllib.parse
  6 import rsa
  7 import binascii
  8 import os
  9 import re
 10 import time
 11 import datetime
 12 import random
 13 
 14 
 15 username=‘‘
 16 password=‘‘
 17 
 18 
 19 cookiejar=http.cookiejar.LWPCookieJar(username)
 20 cookie=urllib.request.HTTPCookieProcessor(cookiejar)
 21 httphandle=urllib.request.HTTPHandler()
 22 opener=urllib.request.build_opener(cookie,httphandle)
 23 urllib.request.install_opener(opener)
 24 
 25 
 26 publickey=EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB 27 784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443
 28 pubkey=int(publickey,16)
 29 
 30 postdata={
 31     entry:weibo,
 32     gateway:1,
 33     from:‘‘,
 34     savestate:7,
 35     useticket:1,
 36     pagerefer:http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F,
 37     vsnf:1,
 38     su:‘‘,
 39     service:miniblog,
 40     servertime:‘‘,
 41     nonce:‘‘,
 42     pwencode:rsa2,
 43     rsakv:1330428213,
 44     sp:‘‘,
 45     sr:1920*1080,
 46     encoding:UTF-8,
 47     prelt:269,
 48     url:http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack,
 49     returntype:META,
 50     showpin:0
 51     }
 52 def gettime():
 53     return time.mktime(datetime.datetime.now().timetuple())
 54     
 55 def openurl(url,chart=utf-8,data=None):
 56     result=opener.open(url,data)
 57     result=result.read()
 58     if(chart!=null):
 59         return result.decode(chart)
 60     else:
 61         return result
 62 
 63 """
 64 login_about get begin
 65 """
 66 def b64(sth):
 67     return base64.b64encode(sth.encode()).decode(utf-8)
 68 
 69 def get_su():
 70     string=urllib.parse.quote(username)
 71     return b64(string)
 72 
 73 def get_sp(st,nc):
 74     key=rsa.PublicKey(pubkey,65537)
 75     message=str(st)+\t+str(nc)+\n+password
 76     sp=rsa.encrypt(message.encode(),key)
 77     sp=binascii.b2a_hex(sp)
 78     return sp.decode(utf-8)
 79 def get_servertime():#and nonce
 80 
 81     url=http://login.sina.com.cn/sso/prelogin.php?entry=weibo&su=%s&checkpin=1&rsakt=mod %(get_su())
 82     page=opener.open(url)
 83     data=json.loads(page.read().decode(utf-8))
 84     
 85     result=[]
 86     result.append(str(data[servertime]))
 87     result.append(str(data[nonce]))
 88     result.append(str(data[pcid]))
 89     return result
 90 """
 91 login_about get end
 92 match begin
 93 """
 94 def match(pattern,string):
 95     a=re.compile(pattern)
 96     result=re.findall(a,string)
 97     return result
 98 def match_fanscount(string):
 99     a=rfans" >([0-9]+)
100     result=match(a,string)
101     if(result!=[]):
102         return result[0]
103     else:
104         return 20000
def match_login_url(string):
    """Return the first URL in ``string`` that ends with ``=0``.

    Raises:
        IndexError: if the text contains no such URL.
    """
    # The listing had [a-zA-z], whose A-z range also matches the punctuation
    # between "Z" and "a"; a URL scheme only needs letters.
    a = r'[a-zA-Z]+://[^\s]*=0'
    result = match(a, string)
    if not result:
        raise IndexError('no login redirect URL found in response')
    return result[0]
def match_uid(string):
    """Return every numeric id found in ``usercard="id=<digits>" href="`` attributes (a list of str)."""
    a = r'usercard="id=([0-9]+)" href="'
    result = match(a, string)
    return result
def match_name(string):
    """Return the nickname assigned to ``CONFIG['onick']`` in the page source.

    Raises:
        IndexError: if the page has no such assignment.
    """
    a = r"CONFIG\['onick'\]='(.+)'"
    result = match(a, string)
    return result[0]
def replace_(st):
    """Return ``st`` with every backslash removed (undoes escaping such as ``\\/``)."""
    a = '\\'

    return st.replace(a, '')
121 """
122 match end
123 do login begin
124 """
def login(postdata):
    """Log in with the configured credentials and save the session cookies.

    Fetches a fresh servertime/nonce pair, fills the credential fields of
    ``postdata`` (the dict is modified in place), posts the form, follows the
    redirect URL found in the reply and writes the cookie jar to disk.

    Args:
        postdata: Login form fields; ``su``, ``sp``, ``servertime`` and
            ``nonce`` are overwritten.

    Raises:
        IndexError: if the reply contains no redirect URL.
    """
    result = get_servertime()
    servertime = result[0]
    nonce = result[1]

    postdata['su'] = get_su()
    postdata['sp'] = get_sp(servertime, nonce)
    postdata['servertime'] = servertime
    postdata['nonce'] = nonce

    postdata = urllib.parse.urlencode(postdata)
    url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36 OPR/27.0.1689.66 (Edition Baidu)',
    }
    req = urllib.request.Request(url, postdata.encode(), headers)
    # urlopen goes through the globally installed opener, so the cookies set
    # by the reply land in the shared cookie jar.
    text = urllib.request.urlopen(req)
    text = text.read()

    text = text.decode('gbk')
    result = match_login_url(text)
    opener.open(result)
    cookiejar.save()
def auto_login():
    """Log in again from the cookies saved on disk.

    Loads the cookie jar, opens the Weibo home page and follows the login
    redirect URL found in it.

    Raises:
        IndexError: if the home page contains no redirect URL.
    """
    cookiejar.load()
    html = openurl('http://weibo.com/', 'gbk')
    url = match_login_url(html)
    opener.open(url)
    # Debugging requests left disabled by the author:
    # opener.open('http://passport.weibo.cn/sso/crossdomain?action=login&savestate=1&retcode=0')
    # result=openurl('http://weibo.com/2598335181/follow?rightmod=1&wvr=6')
    # print(result)
158 
159 """
160 """
161 
162 
# Reuse the saved cookie file when one exists; otherwise do a full login.
if os.path.exists(username):
    print('检测到cookie,自动登录')
    auto_login()
else:
    login(postdata)

 接下来采用requests下的session模块。在session会话条件下,不需要在每一次提交的过程中都手动保存cookies,调用session.post()方法时会自动把cookies提交上去,使用Session可以一直与服务器保持会话:

但是自己编写的程序在最后对网页上的内容进行重定向的时候出现了一些问题:

  1 #!/usr/bin/env python
  2 #---coding:utf-8----
  3 import requests
  4 import json
  5 import urllib
  6 import re
  7 import base64
  8 import rsa
  9 import binascii
 10 from matplotlib._image import Image
 11 
 12 username="*********"
 13 pwd=***************
 14 def get_su(username):
 15     username_html=urllib.quote(username).encode(utf-8)
 16     su=base64.b64encode(username_html).decode(utf-8)
 17    # print su
 18     return su
 19 
 20 def get_sth(su):
 21     # 改字典内的数据经过精简,只有在这些数据存在下才不影响获得所需的准确数据
 22     payload = {entry: weibo, rsakt: mod, su: su, checkpin: 1 }
 23     res = requests.get(http://login.sina.com.cn/sso/prelogin.php,
 24                        params=payload).text
 25     res = eval(res)##将字符串以一种理解的文档进行表达,得到pubkey可以运用任何形式
 26    # print(res)
 27     return res
 28 
 29 def get_sp(pwd,nonce,servertime,pubkey):
 30     ##创建公钥的长度:pubkey的公钥在新浪中得到是固定的
 31     key=rsa.PublicKey(int(pubkey,16),65537)
 32     message=str(servertime)+\t+str(nonce)+\n+pwd
 33     
 34     ##对集进行加密:
 35     passwd=rsa.encrypt(message.encode(utf-8),key)
 36     ##将加密信息转换成16进制
 37     sp=binascii.b2a_hex(passwd)
 38     return sp
 39 def get_pin(pcid):
 40     payload={s:0,p:pcid}
 41     pin_url = "http://login.sina.com.cn/cgi/pin.php"
 42     
 43     Res=requests.Session().get(pin_url,params=payload)
 44     
 45     ##把图片进行显示
 46     with open(cha.jpg,w+) as f:
 47         f.write(Res.content)
 48         f.close()
 49     try:
 50         
 51         im=Image.open(cha.jpg)
 52         im.show()
 53         im.close()
 54     except:
 55         print(在当前目录下没有找到图片)
 56     
 57 if __name__==__main__:
 58     su=get_su(username)##直接对用户名进行加密,采用requets模块得到密码加密信息
 59     res=get_sth(su)
 60     print res
 61     nonce=res[nonce]
 62     rsakv=res[rsakv]
 63     servertime=res[servertime]
 64     pcid=res[pcid]
 65     pubkey=res[pubkey]
 66     showin=False##判断代码是否用验证码
 67     ##获取其中密码,通过RSA加密
 68     sp=get_sp(pwd,nonce,servertime,pubkey)
 69     print 加密用户名:,su,加密的密码:,sp
 70     
 71     ####开始向客户端进行post_data:
 72     
 73     headers = {
 74             User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) 
 75                           AppleWebKit/537.36 (KHTML, like Gecko) 
 76                           Chrome/47.0.2526.80 Safari/537.36
 77         }
 78     payload = {
 79         entry: weibo,
 80         gateway: 1,
 81         from: ‘‘,
 82         savestate: 7,
 83         userticket: 1,
 84         pagerefer: http://login.sina.com.cn/sso/logout.php?entry=miniblog
 85                      &r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F,
 86         vsnf: 1,
 87         su: su,
 88         service: miniblog,
 89         servertime: servertime,
 90         nonce: nonce,
 91         pwencode: rsa2,
 92         rsakv: rsakv,
 93         sp: sp,
 94         encoding: UTF-8,
 95         prelt: 106,
 96         url: http://weibo.com/ajaxlogin.php?framelogin=1&callback=
 97                parent.sinaSSOController.feedBackUrlCallBack,
 98         returntype: META
 99     }
100     
101     ##判断其中是否要验证码:图像或者数字
102     if showin:
103         pcid=res[pcid]
104         get_pin(pcid)
105         
106         payload[door]=input(请输入验证码:)
107         Res=requests.session().post(http://login.sina.com.cn/sso/login.php?client=
108                            ssologin.js(v1.4.18),data=payload,hearders=headers)
109         
110         content=Res.content.decode(GBK)
111         print content
112     else:
113         res =requests. session().post(http://login.sina.com.cn/sso/login.php?client=
114                            ssologin.js(v1.4.18), data=payload, headers=headers)
115         res = res.content.decode(GBK)
116        # print(res),type(res)
117        
118     ##得到内容进行重定向,发现重定向不成功,********
119     pattern = rlocation\.replace\([\‘"](.*?)[\‘"]\)
120     login_url=re.findall(pattern,res)
121     #print login_url
122     ##同样采用这种形式的正则
123     res1=str(res.encode(GBK))##必须将Unicode转换成某种编码方式才能转换str类型
124     #print res1
125     p=re.compile(rlocation\.replace\(\"(.*)\"\))
126     
127     login_url=p.search(res1).group(1)
128     print login_url
129     
130     ###通过得到login——url之后进行处理
131     
132     page=requests.session().get(login_url,headers=headers)
133     
134     print page.content.decode(gb2312)
135     
136     
137     uuid_res = re.findall(r"uniqueid":"(.*?)", page.content)
138     print(uuid_res)
139     
140     weibo = requests.session().get(http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1 % uuid_res)
141     id_pa = r<title>(.*?)</title>
142     
143     #weiboID = re.findall(id_pa, weibo.content.decode("utf-8"), re.S)[0]
144      
145     #print weiboID
146     
147     
148     
149     
150         
151              
152                           
153     
154     

 

爬虫实例(三):模拟登陆新浪

标签:

原文地址:http://www.cnblogs.com/woainifanfan/p/5832634.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!