标签:网络爬虫 python webbrowser taobao cookies
终于实现了登录淘宝,这个验证码机制困扰了我好几天啊。
代码中提供了两种获取验证码的方式:第一种通过webbrowser的open直接在浏览器中打开含有验证码的图片;第二种是将其以JPEG格式保存到
C:\\Users\\Administrator\\Desktop\\checkcode.jepg。你可以根据自己主机的用户名更改路径。同时,这段代码需要先在代码中指定用户名和密码,也
可以改为运行时实时输入用户名和密码,稍微修改一下代码即可。
首先根据HttpFox分析网页数据,之后再使用正则表达式提取出你想要的数据,并将其显示出来。过几天可能会写一个从淘宝上抓取信息的爬虫,现在
还没有使用各种爬虫框架,基本都是使用urllib、urllib2等比较基础的包,过一阶段可能会学习框架。我也只是一个菜鸟,一个想要飞得
更高的菜鸟罢了。代码里我写了比较详细的注释,基本逻辑就是:初始化设置cookie等->发送post数据,从返回信息中抓取验证码->再次发送
携带验证码的post数据->从返回信息中提取登陆状态。
# -*- coding: utf-8 -*- import urllib import urllib2 import cookielib #设置opener import re #正则表达式 import webbrowser #打开界面 #淘宝登录地址 tbLoginUrl = "https://login.taobao.com/member/login.jhtml" #存放验证码图片的网址 checkCodeUrl = '' #post请求头部 headers = { 'x-requestted-with': 'XMLHttpRequest', 'Accept-Language': 'zh-cn', 'Accept-Encoding': 'gzip, deflate', 'ContentType': 'application/x-www-form-urlencoded; chartset=UTF-8', 'Host': 'login.taobao.com', 'DNT': 1, 'Cache-Control': 'no-cache', 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1', 'Referer' : 'https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Fwww.taobao.com%2F', 'Connection' : 'Keep-Alive' } #设置用户名,密码 username = "" password = "" #同样可以采用实时输入模式 #username = raw_input("Please input your username of taobao: ") #password = raw_input("Please input your password of taobao: ") #请求数据包 postData = { 'TPL_username':username, 'TPL_password':password, "need_check_code" : "false", "loginsite": 0, "newlogin":1, 'TPL_redirect_url':'', 'from':'tbTop', 'fc':"default", 'style':'default', 'css_style':'', 'tid':'', 'support':'000001', 'CtrlVersion':'1,0,0,7', 'loginType':3, 'minititle':'', 'minipara' :'', "umto":"NAN", 'pstrong':2, 'llnick':'', 'sign':'', 'need_sign':'', "isIgnore":'', "full_redirect":'', 'popid':'', 'callback':'1', 'guf':'', 'not_duplite_str':'', 'need_user_id':'', 'poy':'', 'gvfdcname':10, 'from_encoding':'', "sub":'', "allp":'', 'action':'Authenticator', 'event_submit_do_login':'anything', 'longLogin':0 } #登录主函数 def loginToTaobao(): #设置代理IP,防止频率过高本地IP被封 urllib2.ProxyHandler({'http':'http://120.193.146.97:843'}) #cookie 自动处理器 cookiejar = cookielib.LWPCookieJar()#LWPCookieJar提供可读写操作的cookie文件,存储cookie对象 cookieSupport= urllib2.HTTPCookieProcessor(cookiejar) opener = urllib2.build_opener(cookieSupport, urllib2.HTTPHandler) urllib2.install_opener(opener) #打开登陆页面 taobao = urllib2.urlopen(tbLoginUrl) resp = taobao.read().decode("gbk") #此时直接发送post数据包到登陆地址 
sendPostData(tbLoginUrl, postData, headers) #打开验证码图片的网页 webbrowser.open_new_tab(checkCodeUrl) #将验证码图片下载到本地 if checkCodeUrl != "": getCheckCode(checkCodeUrl) sendPostData(tbLoginUrl, postData, headers) #发送post数据到登陆网址 def sendPostData(url, data, header): print "+"*20+"sendPostData"+"+"*20 data = urllib.urlencode(data) request = urllib2.Request(url, data, header) response = urllib2.urlopen(request) text = response.read().decode("gbk") info = response.info() status = response.getcode() response.close() print status print info print "Response:", text #如果为第一次调用,则进入获取验证码的函数 if checkCodeUrl == "": global checkCodeUrl checkCodeUrl = getIdenCode(text) print checkCodeUrl result = handleResponseText(text) print result if result["state"]: print "successfully login in!" else: print "failed to login in, error message: ",result["message"] #利用正则得到存放二维码图片的网址 def getIdenCode(page): #得到验证码的图片 pattern = re.compile('ccurl":"(.*?)"',re.S) #匹配的结果 matchResult = re.search(pattern,page) #已经匹配得到内容,并且验证码图片链接不为空,返回(.*?)中的内容 if matchResult and matchResult.group(1): print matchResult.group(1) return matchResult.group(1) else: print u"没有找到验证码内容" return False #从数据中抓取网页登陆的状态,并输出到界面上 def handleResponseText(text): """处理登录返回结果""" global checkCodeUrl print "+"*20+"handleResponseText"+"+"*20 text = text.replace(',', ' ') responseData = {"state": False, "message" : "", "code" : ""} m1 = re.match(r'\{?"state":(\w*)\ ', text) if m1 is not None: s = m1.group(1) if s == "true": responseData["state"] = True else: m2 = re.search(r'"message":"(\S*)"( |})', text) if m2 is not None: msg = m2.group(1) responseData["message"] = msg.encode("utf-8") else: print "failed to get the error message" m3 = re.match(r'.+\"code":(\w*)\ ', text) if m3 is not None: code = m3.group(1) responseData["code"] = code else: print "failed to get the error code" return responseData #将图片存在本地路径中 def getCheckCode(url): print "+"*20+"getCheckCode"+"+"*20 response = urllib2.urlopen(url) status = response.getcode() picData = response.read() 
path = "C:\\Users\\Administrator\\Desktop\\checkcode.jepg" if status == 200: localPic = open(path, "wb") localPic.write(picData) localPic.close() print "请到%s,打开验证码图片"%path checkCode = raw_input("请输入验证码:") print checkCode, type(checkCode) postData["TPL_checkcode"] = checkCode postData["need_check_code"] = "true" else: print "failed to get Check Code, status: ",status if __name__ == "__main__": print "-"*54 print "|"+"+"*20+"京东放养的爬虫"+"+"*20+"|" print "-"*54+"\n\n" loginToTaobao()
标签:网络爬虫 python webbrowser taobao cookies
原文地址:http://blog.csdn.net/djd1234567/article/details/45296619