码迷,mamicode.com
首页 > 编程语言 > 详细

python爬虫:登录百度账户,并上传文件到百度云盘

时间:2017-06-23 10:28:28      阅读:683      评论:0      收藏:0      [点我收藏+]

标签:dig   hunk   相关   isa   zlib   country   floor   multipart   抓包   

login.js文件:

/**
 * Created by resolvewang on 2017/4/15.
 */
function getGid() {
    return "xxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, function (e) {
        var t = 16 * Math.random() | 0, n = "x" == e ? t : 3 & t | 8;
        return n.toString(16)
    }).toUpperCase()
}

function  getCallback() {
    return "bd__cbs__" + Math.floor(2147483648 * Math.random()).toString(36)
}

Pyhton实现代码:

#-*- coding:utf-8 -*-
__author__ = Administrator

import time
import json
import re
import requests
import execjs
import base64
from urllib.parse import urlencode
from requests_toolbelt import MultipartEncoder
from Crypto.Cipher import PKCS1_v1_5
from Crypto.PublicKey import RSA
from hashlib import md5
from zlib import crc32

try:
    requests.packages.urllib3.disable_warnings()
except:
    pass

headers = {User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 
                         (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36,
           }

# 全局的session
session = requests.session()
session.get(https://pan.baidu.com, headers=headers)

class BufferReader(MultipartEncoder):
    """将multipart-formdata转化为stream形式的Proxy类
    """

    def __init__(self, fields, boundary=None, callback=None, cb_args=(), cb_kwargs=None):
        self._callback = callback
        self._progress = 0
        self._cb_args = cb_args
        self._cb_kwargs = cb_kwargs or {}
        super(BufferReader, self).__init__(fields, boundary)

    def read(self, size=None):
        chunk = super(BufferReader, self).read(size)
        self._progress += int(len(chunk))
        self._cb_kwargs.update({
            size: self._len,
            progress: self._progress
        })
        if self._callback:
            try:
                self._callback(*self._cb_args, **self._cb_kwargs)
            except:  # catches exception from the callback
                # raise CancelledError(‘The upload was cancelled.‘)
                pass
        return chunk

def _get_runntime():
    """
    :param path: 加密js的路径,注意js中不要使用中文!估计是pyexecjs处理中文还有一些问题
    :return: 编译后的js环境,不清楚pyexecjs这个库的用法的请在github上查看相关文档
    """
    phantom = execjs.get()  # 这里必须为phantomjs设置环境变量,否则可以写phantomjs的具体路径
    with open(login.js, r) as f:
        source = f.read()
    return phantom.compile(source)

def get_gid():
    return _get_runntime().call(getGid)

def get_callback():
    return _get_runntime().call(getCallback)

def _get_curtime():
    return int(time.time()*1000)

# 抓包也不是百分百可靠啊,这里?getapi一定要挨着https://passport.baidu.com/v2/api/写,才会到正确的路由
def get_token(gid, callback):
    cur_time = _get_curtime()
    get_data = {
        tpl: netdisk,
        subpro: netdisk_web,
        apiver: v3,
        tt: cur_time,
        class: login,
        gid: gid,
        logintype: basicLogin,
        callback: callback
    }
    headers.update(dict(Referer=http://pan.baidu.com/, Accept=*/*, Connection=keep-alive, Host=passport.baidu.com))
    resp = session.get(url=https://passport.baidu.com/v2/api/?getapi, params=get_data, headers=headers)
    if resp.status_code == 200 and callback in resp.text:
        # 如果json字符串中带有单引号,会解析出错,只有统一成双引号才可以正确的解析
        #data = eval(re.search(r‘.*?\((.*)\)‘, resp.text).group(1))
        data = json.loads(re.search(r.*?\((.*)\), resp.text).group(1).replace("", "))
        return data.get(data).get(token)
    else:
        print(获取token失败)
        return None

def get_rsa_key(token, gid, callback):
    cur_time = _get_curtime()
    get_data = {
        token: token,
        tpl: netdisk,
        subpro: netdisk_web,
        apiver: v3,
        tt: cur_time,
        gid: gid,
        callback: callback,
    }
    resp = session.get(url=https://passport.baidu.com/v2/getpublickey, headers=headers, params=get_data)
    if resp.status_code == 200 and callback in resp.text:
        data = json.loads(re.search(r.*?\((.*)\), resp.text).group(1).replace("", "))
        return data.get(pubkey), data.get(key)
    else:
        print(获取rsa key失败)
        return None

def encript_password(password, pubkey):
    """
    import rsa
    使用rsa库加密(法一)
    pub = rsa.PublicKey.load_pkcs1_openssl_pem(pubkey.encode(‘utf-8‘))
    encript_passwd = rsa.encrypt(password.encode(‘utf-8‘), pub)
    return base64.b64encode(encript_passwd).decode(‘utf-8‘)

    """
    # pubkey必须为bytes类型
    pub=RSA.importKey(pubkey.encode(utf-8))
    #构造“加密器”
    encryptor=PKCS1_v1_5.new(pub)
    #加密的内容必须为bytes类型
    encript_passwd =encryptor.encrypt(password.encode(utf-8))
    return base64.b64encode(encript_passwd).decode(utf-8)

def login(token, gid, callback, rsakey, username, password):
    post_data = {
        staticpage: http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html,
        charset: utf-8,
        token: token,
        tpl: netdisk,
        subpro: netdisk_web,
        apiver: v3,
        tt: _get_curtime(),
        codestring: ‘‘,
        safeflg: 0,
        u: http://pan.baidu.com/disk/home,
        isPhone: ‘‘,
        detect: 1,
        gid: gid,
        quick_user: 0,
        logintype: basicLogin,
        logLoginType: pc_loginBasic,
        idc: ‘‘,
        loginmerge: true,
        foreignusername: ‘‘,
        username: username,
        password: password,
        mem_pass: on,
        # 返回的key
        rsakey: rsakey,
        crypttype: 12,
        ppui_logintime: 33554,
        countrycode: ‘‘,
        callback: parent.+callback
    }
    resp = session.post(url=https://passport.baidu.com/v2/api/?login, data=post_data, headers=headers)
    if err_no=0 in resp.text:
        print(登录成功)
    else:
        print(登录失败)

def upload(dest_path,file_handle,token):
     params = {
            method: upload,
            app_id: "250528",
            BDUSS: session.cookies[BDUSS],
            t: str(int(time.time())),
            bdstoken: token,
            path: dest_path,
            ondup: "newcopy"
        }
     # print(params)
     files = {file: (str(int(time.time())), file_handle)}
     url = https://{0}/rest/2.0/pcs/file.format(pcs.baidu.com)
     api = %s?%s % (url, urlencode(params))
     # print(api)
     body = BufferReader(files)
     # print(body)
     baibupan_header = {"Referer": "http://pan.baidu.com/disk/home",
                    "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia"}
     header = dict(baibupan_header.items())
     # print(headers)
     header.update({"Content-Type": body.content_type})
     response = session.post(api, data=body, verify=False, headers=header)
     return response

def rapidupload(dest_path,file_handler,token):
    """秒传一个文件
    :param file_handler: 文件handler, e.g. open(‘file‘,‘rb‘)
    :type file_handler: file

    :param dest_path: 上传到服务器的路径,包含文件名
    :type dest_path: str

    :return: requests.Response
        .. note::
            * 文件已在服务器上存在,不上传,返回示例
            {
                "path" : "/apps/album/1.jpg",
                "size" : 372121,
                "ctime" : 1234567890,
                "mtime" : 1234567890,
                "md5" : "cb123afcc12453543ef",
                "fs_id" : 12345,
                "isdir" : 0,
                "request_id" : 12314124
            }
            * 文件不存在,需要上传
            {"errno":404,"info":[],"request_id":XXX}
            * 文件大小不足 256kb (slice-md5 == content-md5) 时
            {"errno":2,"info":[],"request_id":XXX}
            * 远程文件已存在
            {"errno":-8,"info":[],"request_id":XXX}
    """

    file_handler.seek(0, 2)
    _BLOCK_SIZE = 2 ** 20
    content_length = file_handler.tell()
    file_handler.seek(0)

    # 校验段为前 256KB
    first_256bytes = file_handler.read(256 * 1024)
    slice_md5 = md5(first_256bytes).hexdigest()

    content_crc32 = crc32(first_256bytes).conjugate()
    content_md5 = md5(first_256bytes)

    while True:
        block = file_handler.read(_BLOCK_SIZE)
        if not block:
            break
        # 更新crc32和md5校验值
        content_crc32 = crc32(block, content_crc32).conjugate()
        content_md5.update(block)

    params = {
            method: rapidupload,
            app_id: "250528",
            BDUSS: session.cookies[BDUSS],
            t: str(int(time.time())),
            bdstoken: token,
            path: dest_path,
            ondup: "newcopy"
            }

    data = {
            content-length: content_length,
            content-md5: content_md5.hexdigest(),
            slice-md5: slice_md5,
            content-crc32: %d % (content_crc32.conjugate() & 0xFFFFFFFF)
            }
    baibupan_header = {"Referer": "http://pan.baidu.com/disk/home",
                    "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia"}
    header = dict(baibupan_header.items())
    url = https://{0}/rest/2.0/pcs/file.format(pcs.baidu.com)
    api = %s?%s % (url, urlencode(params))
    # print(api)
    response= session.post(api, data=data, verify=False,headers=header)
    return response

if __name__ == __main__:
    user=xxx  #用户名
    password=xxx  #密码

    cur_gid = get_gid()
    cur_callback = get_callback()
    cur_token = get_token(cur_gid, cur_callback)
    # print("token:%s" %(cur_token))
    cur_pubkey, cur_key = get_rsa_key(cur_token, cur_gid, cur_callback)
    encript_password = encript_password(password, cur_pubkey)
    login(cur_token, cur_gid, cur_callback, cur_key, user, encript_password)
    # print("cookies:%s" %(session.cookies[‘BDUSS‘]))

    # res=upload("/hello/temp.txt",open("temp.txt",‘rb‘),cur_token)
    # print(res.content.decode(‘utf-8‘))
    res=rapidupload("/hello/words.txt",open("words.txt",rb),cur_token)
    print(res.content.decode(utf-8))

 

python爬虫:登录百度账户,并上传文件到百度云盘

标签:dig   hunk   相关   isa   zlib   country   floor   multipart   抓包   

原文地址:http://www.cnblogs.com/yizhenfeng168/p/7067966.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!