码迷,mamicode.com
首页 > Windows程序 > 详细

新浪微博数据挖掘菜谱之一: 登录篇 (API)

时间:2014-12-28 08:11:03      阅读:355      评论:0      收藏:0      [点我收藏+]

标签:数据挖掘

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2014-12-28
@author: beyondzhou
@name: login.py
'''

import re, json
import urllib, urllib2, urllib3, cookielib
import base64, rsa, binascii # encrypt
from weibo import APIClient

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, cls, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(cls, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, cls, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_302(cls, req, fp, code, msg, headers)
        result.status = code
        return result
    
def get_cookie():
    cookies = cookielib.CookieJar()
    return urllib2.HTTPCookieProcessor(cookies)
   
def get_opener(proxy=False):
    """Build a urllib2 opener with cookie handling and redirect tagging.

    The opener is assembled from get_cookie() and SmartRedirectHandler and
    sends an MSIE 6 User-agent header by default.

    `proxy` is accepted for interface compatibility; this function does not
    read it.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'
    opener = urllib2.build_opener(get_cookie(), SmartRedirectHandler())
    opener.addheaders = [('User-agent', user_agent)]
    return opener

class SinaAPI():
    def __init__(self, CALLBACK_URL, APP_KEY, REDIRECT_URL, USER_ID, USER_PSWD):
        self.CALLBACK_URL = CALLBACK_URL
        self.APP_KEY = APP_KEY
        self.REDIRECT_URL = REDIRECT_URL
        self.USER_ID = USER_ID
        self.USER_PSWD = USER_PSWD
        self.http = urllib3.PoolManager()
        
    def get_username(self, USER_ID):
        # The Encryption Algorithm of username 
        # ssologin.js : ah.su=sinaSSOEncoder.base64.encode(m(aj));
        USER_ID_ = urllib.quote(USER_ID) # encode username, avoid error refer:@ &  
        su = base64.encodestring(USER_ID_)[:-1]
        return su
   
    def get_password_rsa(self, USER_PSWD, PUBKEY, servertime, nonce):
        # rsa Encrypt :  #when pwencode = "rsa2"
        rsaPubkey = int(PUBKEY, 16)#pubkey from 16 to 10
        key_1 = int('10001', 16) #10001 to 65537 
        key = rsa.PublicKey(rsaPubkey, key_1) #
        message = str(servertime) + "\t" + str(nonce) + "\n" + str(USER_PSWD)
        passwd = rsa.encrypt(message, key)
        passwd = binascii.b2a_hex(passwd) #to 16
        return passwd
      
    def get_parameter(self):
        su = self.get_username(self.USER_ID)
        url = "https://login.sina.com.cn/sso/prelogin.php?entry=openapi&callback=sinaSSOController.preloginCallBack&su="+su+"&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.15)"
        r = self.http.request('GET', url)
        p = re.compile('\((.*)\)')
        json_data = p.search(r.data).group(1)
        data = json.loads(json_data)
        
        PUBKEY = data['pubkey']
        pcid = data['pcid']
        servertime = str(data['servertime'])
        nonce = data['nonce']
        rsakv = str(data['rsakv'])
        sp = self.get_password_rsa(self.USER_PSWD, PUBKEY, servertime, nonce)
        
        #print pcid; print servertime; print nonce; print rsakv; print sp; print su
        return pcid, servertime, nonce, rsakv, sp, su
         
    def get_ticket(self):
        pcid, servertime, nonce, rsakv, sp, su = self.get_parameter()
        fields = urllib.urlencode({
            'entry'        : 'openapi',
            'gateway'      : '1',
            'from'         : '',
            'savestate'    : '0',
            'useticket'    : '1',
            'pagerefer'    :'',
            'pcid'         : pcid,
            'ct'           : '1800',
            's'            : '1',
            'vsnf'         : '1',
            'vsnval'       : '',
            'door'         : '',
            'appkey'       : 'kxR5R',
            'su'           : su,
            'service'      : 'miniblog',
            'servertime'   : servertime,
            'nonce'        : nonce,
            'pwencode'     : 'rsa2',
            'rsakv'        : rsakv,
            'sp'           : sp,
            'sr'           : '1680*1050',
            'encoding'     : 'UTF-8',
            'cdult'        : '2',
            'domain'       : 'weibo.com',
            'prelt'        : '0',
            'returntype'   : 'TEXT',
        })
        headers = {
                   #"Request": "POST /sso/login.php?client=ssologin.js(v1.4.15)&_=1400652171542 HTTP/1.1",
                   #"Accept": "*/*", 
                   "Content-Type": "application/x-www-form-urlencoded",
                   #"Referer": self.CALLBACK_URL,
                   #"Accept-Language": "zh-CN",
                   #"Origin": "https://api.weibo.com",
                   #"Accept-Encoding": "gzip, deflate",
                   #"User-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; EIE10;ZHCNMSE; rv:11.0) like Gecko",
                   #"Host": "login.sina.com.cn",
                   #"Connection": "Keep-Alive",
                   #"Cache-Control": "no-cache",
                   }
        url = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)"
        req = urllib2.Request(url, fields, headers)
        f = urllib2.urlopen(req)
        data = json.loads(f.read())
        return data["ticket"]
    
    def get_code_Security(self): 
        ticket = self.get_ticket()
        fields = urllib.urlencode({
            'action': 'submit', # must
            'display': 'default',
            'withOfficalFlag': '0', # must
            'quick_auth': 'null',
            'withOfficalAccount': '',
            'scope': '',
            'ticket': ticket, # must
            'isLoginSina': '',  
            'response_type': 'code', # must
            'regCallback': 'https://api.weibo.com/2/oauth2/authorize?client_id='+self.APP_KEY+'&response_type=code&display=default&redirect_uri='+self.REDIRECT_URL+'&from=&with_cookie=',
            'redirect_uri': self.REDIRECT_URL, # must
            'client_id': self.APP_KEY, # must
            'appkey62': 'kxR5R',
            'state': '', # must
            'verifyToken': 'null',
            'from': '', # must
            'userId': "", # do not need enter userId
            'passwd': "", # do not need enter password
            })
        LOGIN_URL = 'https://api.weibo.com/oauth2/authorize' 
        headers = {"User-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; EIE10;ZHCNMSE; rv:11.0) like Gecko",
                   "Referer": self.CALLBACK_URL,
                   "Content-Type": "application/x-www-form-urlencoded",
                   }
        req = urllib2.Request(LOGIN_URL, fields, headers)
        req_ =urllib2.urlopen(req)
        return_redirect_uri = req_.geturl()
        
        print 'return_redirect_uri:', return_redirect_uri
        code = re.findall(r"(?<=code%3D).{32}|(?<=code=).{32}|(?<=code%253D).{32}", return_redirect_uri) # url is formatted with %3D or= 
        return code 
    
    def get_code_NS(self):
        fields = urllib.urlencode({
            'action': 'submit', # must
            'display': 'default',
            'withOfficalFlag': '0', # must
            'quick_auth': 'null',
            'withOfficalAccount': '',
            'scope': '',
            'ticket': '', # must
            'isLoginSina': '',  
            'response_type': 'code', # must
            'regCallback': '',
            'redirect_uri': self.REDIRECT_URL, # must
            'client_id': self.APP_KEY, # must
            'appkey62': 'kxR5R',
            'state': '', # must
            'verifyToken': 'null',
            'from': '', # must
            'userId': self.USER_ID, # must
            'passwd': self.USER_PSWD, # must
            })
        LOGIN_URL = 'https://api.weibo.com/oauth2/authorize' 
        headers = {"User-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; EIE10;ZHCNMSE; rv:11.0) like Gecko",
                   "Referer": self.CALLBACK_URL,
                   "Content-Type": "application/x-www-form-urlencoded",
                   }
        r = urllib2.Request(LOGIN_URL, fields, headers)
        opener = get_opener(False) 
        urllib2.install_opener(opener)
        try:  
            f = opener.open(r)  
            return_redirect_uri = f.url   
            print "NS1", return_redirect_uri             
        except urllib2.HTTPError, e:  
            return_redirect_uri = e.geturl()  
            print "NS2", return_redirect_uri  
        # get the code
        #code = return_redirect_uri.split('=')[1]
        # re-generate with regexp expression
        code = re.findall(r"(?<=code%3D).{32}|(?<=code=).{32}", return_redirect_uri) 
        print code
        return code 

def weibo_login(): 

    # sina weibo basic secret information
    APP_KEY = u'' # app key
    APP_SECRET = u'' # app secret
    REDIRECT_URL = ''
    USER_NAME = ''
    USER_PASSWD = ''
    
    client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=REDIRECT_URL)
    CALLBACK_URL = client.get_authorize_url()
    print 'callback_url:', CALLBACK_URL
    API = SinaAPI(CALLBACK_URL, APP_KEY, REDIRECT_URL, USER_NAME, USER_PASSWD)
    code = API.get_code_Security()
    print 'code:', code
    requests = client.request_access_token(code)  
    access_token = requests.access_token  
    expires_in = requests.expires_in  
    
    # access_token  
    client.set_access_token(access_token, expires_in) 
    return client

if __name__ == '__main__':
    
    # get weibo_api to access sina api
    weibo_api = weibo_login()
    print 'weibo_api:', weibo_api

    # get 200 public weibo
    statuses = weibo_api.statuses.public_timeline.get(count=200)
    print json.dumps(statuses, indent=1) 

Result:

code: ['514b806e619c320b7b1ed85ec0d9880a']
weibo_api: <weibo.APIClient object at 0x027786D0>
{
 "interval": 0, 
 "hasvisible": false, 
 "total_number": 2, 
 "previous_cursor": 0, 
 "next_cursor": 0, 
 "statuses": [
  {
   "reposts_count": 0, 
   "truncated": false, 
   "text": "\u7238\u7238\uff0c\u5c31\u50cf\u8fd9\u4e48\u75bc\u2026\u2026", 
   "visible": {
    "type": 0, 
    "list_id": 0
   }, 
   "in_reply_to_status_id": "", 
   "bmiddle_pic": "http://ww1.sinaimg.cn/bmiddle/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg", 
   "id": 3792648903463407, 
   "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg", 
   "mid": "3792648903463407", 
   "source": "<a href=\"http://app.weibo.com/t/feed/3auC5p\" rel=\"nofollow\">\u76ae\u76ae\u65f6\u5149\u673a</a>", 
   "attitudes_count": 0, 
   "in_reply_to_screen_name": "", 
   "pic_urls": [
    {
     "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg"
    }
   ], 
   "in_reply_to_user_id": "", 
   "darwin_tags": [], 
   "favorited": false, 
   "original_pic": "http://ww1.sinaimg.cn/large/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg", 
   "idstr": "3792648903463407", 
   "source_type": 1, 
   "user": {
    "bi_followers_count": 310, 
    "domain": "", 
    "avatar_large": "http://tp4.sinaimg.cn/3735171291/180/40031850543/0", 
    "verified_source": "", 
    "ptype": 0, 
    "statuses_count": 1589, 
    "allow_all_comment": true, 
    "id": 3735171291, 
    "verified_reason_url": "", 
    "city": "1000", 
    "province": "50", 
    "credit_score": 80, 
    "block_app": 0, 
    "follow_me": false, 
    "verified_reason": "", 
    "followers_count": 563, 
    "location": "\u91cd\u5e86", 
    "verified_trade": "", 
    "mbtype": 0, 
    "verified_source_url": "", 
    "profile_url": "u/3735171291", 
    "block_word": 0, 
    "avatar_hd": "http://tp4.sinaimg.cn/3735171291/180/40031850543/0", 
    "star": 0, 
    "description": "", 
    "friends_count": 772, 
    "online_status": 0, 
    "mbrank": 0, 
    "idstr": "3735171291", 
    "profile_image_url": "http://tp4.sinaimg.cn/3735171291/50/40031850543/0", 
    "allow_all_act_msg": false, 
    "verified": false, 
    "geo_enabled": true, 
    "class": 1, 
    "screen_name": "Cz_\u5a77\u7ea6", 
    "lang": "zh-cn", 
    "weihao": "", 
    "remark": "", 
    "favourites_count": 0, 
    "name": "Cz_\u5a77\u7ea6", 
    "url": "", 
    "gender": "f", 
    "created_at": "Thu Aug 22 12:14:23 +0800 2013", 
    "verified_type": -1, 
    "following": false, 
    "pagefriends_count": 0, 
    "urank": 8
   }, 
   "geo": null, 
   "created_at": "Sun Dec 28 07:03:45 +0800 2014", 
   "mlevel": 0, 
   "comments_count": 0
  }, 
  {
   "reposts_count": 0, 
   "truncated": false, 
   "text": "#\u60c5\u4fc2\u963f\u54f2#\u4e70\u4e00\u5f20\u6f14\u5531\u4f1a\u95e8\u7968\uff0c\u5e26\u4e0a\u5c0f\u4f19\u4f34\uff0c\u7a7f\u7740\u4f1a\u670d\u62ff\u7740\u4f60\u7684\u4e13\u8f91\u548c\u71c8\u724c\u5954\u8d74\uff0c\u4e0d\u7528\u79bb\u4f60\u5f88\u8fd1\u54ea\u6015\u5750\u5728\u89d2\u843d\uff0c\u54ea\u6015\u4e0d\u80fd\u770b\u6e05\u53f0\u4e0a\u7684\u4f60\uff0c\u54ea\u6015\u4f60\u4e0d\u77e5\u9053\u4eba\u7fa4\u8fd8\u6709\u4e00\u4e2a\u6211\u3002\u4f46\u53ea\u8981\u80fd\u542c\u89c1\u5e38\u5e38\u5728\u8033\u673a\u91cc\u51fa\u73b0\u7684\u90a3\u4e2a\u58f0\u97f3\uff0c\u53ea\u8981\u80fd\u5750\u5728\u8fd9\u91cc\u4e00\u8d77\u4e3a\u4f60\u5450\u558a\uff0c\u53ea\u8981\u80fd\u548c\u4f60\u7ad9\u5728\u540c\u6837\u7684\u5929\u7a7a\u4e0b\u547c\u5438\u540c\u4e00\u7247\u6c27\u6c14\uff0c\u8fd9\u6837\u5c31\u5df2\u7ecf\u5f88\u597d\u4e86", 
   "visible": {
    "type": 0, 
    "list_id": 0
   }, 
   "in_reply_to_status_id": "", 
   "bmiddle_pic": "http://ww4.sinaimg.cn/bmiddle/5d16267bjw1enp0tkhsxhj20c80klq39.jpg", 
   "id": 3792648903463367, 
   "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/5d16267bjw1enp0tkhsxhj20c80klq39.jpg", 
   "mid": "3792648903463367", 
   "source": "<a href=\"http://app.weibo.com/t/feed/4P1GTP\" rel=\"nofollow\">\u6735\u552f\u5973\u6027\u624b\u673a</a>", 
   "attitudes_count": 0, 
   "in_reply_to_screen_name": "", 
   "pic_urls": [
    {
     "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/5d16267bjw1enp0tkhsxhj20c80klq39.jpg"
    }
   ], 
   "in_reply_to_user_id": "", 
   "darwin_tags": [], 
   "favorited": false, 
   "original_pic": "http://ww4.sinaimg.cn/large/5d16267bjw1enp0tkhsxhj20c80klq39.jpg", 
   "idstr": "3792648903463367", 
   "source_type": 1, 
   "user": {
    "bi_followers_count": 206, 
    "domain": "", 
    "avatar_large": "http://tp4.sinaimg.cn/1561732731/180/5709043270/0", 
    "verified_source": "", 
    "ptype": 0, 
    "statuses_count": 57711, 
    "allow_all_comment": false, 
    "id": 1561732731, 
    "verified_reason_url": "", 
    "city": "8", 
    "province": "37", 
    "credit_score": 80, 
    "block_app": 0, 
    "follow_me": false, 
    "verified_reason": "", 
    "followers_count": 1487, 
    "location": "\u5c71\u4e1c \u6d4e\u5b81", 
    "verified_trade": "", 
    "mbtype": 0, 
    "verified_source_url": "", 
    "profile_url": "u/1561732731", 
    "block_word": 0, 
    "avatar_hd": "http://ww3.sinaimg.cn/crop.100.0.318.318.1024/5d16267bjw8ellhct6ii9j20ef08ujs6.jpg", 
    "star": 0, 
    "description": "\u672c\u547d\u3001\u7537\u795e\uff1a\u5f35\u4fe1\u54f2/\u5973\u795e\uff1a\u6797\u4f9d\u6668/\u5076\u50cf:\u74ca\u7464\u963f\u59e8/\u559c\u6b61\u7684\u9b54\u8853\u5e2b:\u5289\u8b19/\u559c\u6b61\u53f0\u7063\u5287 \u91d1\u5eb8\u5287/", 
    "friends_count": 951, 
    "online_status": 1, 
    "mbrank": 0, 
    "idstr": "1561732731", 
    "profile_image_url": "http://tp4.sinaimg.cn/1561732731/50/5709043270/0", 
    "allow_all_act_msg": false, 
    "verified": false, 
    "geo_enabled": true, 
    "class": 1, 
    "screen_name": "\u88d9\u89d2\u98db\u63da0326\u4f9d\u5fc3\u54f2\u610f", 
    "lang": "zh-tw", 
    "weihao": "", 
    "remark": "", 
    "favourites_count": 22, 
    "name": "\u88d9\u89d2\u98db\u63da0326\u4f9d\u5fc3\u54f2\u610f", 
    "url": "http://blog.sina.com.cn/qunjiao2quan", 
    "gender": "f", 
    "created_at": "Sun Nov 01 17:38:36 +0800 2009", 
    "verified_type": 220, 
    "following": false, 
    "pagefriends_count": 0, 
    "urank": 23
   }, 
   "geo": null, 
   "created_at": "Sun Dec 28 07:03:45 +0800 2014", 
   "mlevel": 0, 
   "comments_count": 0
  }
 ]
}


新浪微博数据挖掘菜谱之一: 登录篇 (API)

标签:数据挖掘

原文地址:http://blog.csdn.net/guaguastd/article/details/42211273

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!