码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫爬oj用户头像

时间:2016-04-27 22:30:48      阅读:238      评论:0      收藏:0      [点我收藏+]

标签:

技术分享
import requests
import Queue
import urllib
import urllib2
import re
import requests
# Image paths that have already been handled (getImg uses this to skip repeats).
alreadyImg = set()

# One session object is used for the login and for every later request.
s = requests.session()
# SECURITY NOTE(review): the account name and password are hard-coded here;
# they should come from the environment or an untracked config file.
s.post(
    "http://acm.hrbust.edu.cn/index.php?m=User&a=login",
    data={
        "user_name": "1304020306",
        "password": "123456",
    },
)
# Fetch one profile page and dump it, presumably to eyeball whether the login worked.
r = s.get("http://acm.hrbust.edu.cn/index.php?m=User&a=userInfo&user_name=1404020214")
print(r.text)

# Queue of pages still to visit; a non-positive maxsize means "no size limit".
urllist = Queue.Queue(maxsize=-1)
# Links that have already been queued (putUrl uses this to skip repeats).
already = set()
# The crawl starts from the rating rank list.
url = "http://acm.hrbust.edu.cn/index.php?m=Ranklist&a=showRatingrank"
urllist.put(url)

# Captures the value of every `a href="..."` attribute.
reg = r'a href="(.+?)"'
httpre = re.compile(reg)
# Earlier, broader image pattern kept for reference: r'src="(.+?\.jpg)"'
# Group 1 is the avatar path, group 2 is its extension (png or jpg).
reimg = r'img class="large_avatar" src="([^>]+?\.(png|jpg))>?"'
imgre = re.compile(reimg)
def putUrl(html):
    """Queue every link in ``html`` that has not been queued before.

    Each href captured by the module-level ``httpre`` pattern is turned into
    an absolute URL on the judge's host, recorded in the module-level
    ``already`` set and appended to the module-level ``urllist`` queue.

    Args:
        html: Text of a fetched page.
    """
    for url in httpre.findall(html):
        # Test the scheme prefix rather than a substring, so a relative link
        # that merely carries "http" in its query string is still resolved.
        if url.startswith(("http://", "https://")):
            realurl = url
        else:
            # Strip leading slashes so the join never produces "//".
            realurl = "http://acm.hrbust.edu.cn/" + url.lstrip("/")
        # De-duplicate on the absolute form so the same page reached through
        # a relative and an absolute href is only queued once.
        if realurl not in already:
            already.add(realurl)
            urllist.put(realurl)
x = 0  # running index used to name the saved image files


def getImg(html):
    """Download every avatar image in ``html`` that has not been seen before.

    Image paths are taken from the module-level ``imgre`` pattern and
    remembered in the module-level ``alreadyImg`` set. Each successful
    download is written to ``C:/<n>.jpg`` and bumps the global counter ``x``;
    a failed download is skipped without consuming an index.

    Args:
        html: Text of a fetched page.
    """
    global x
    for match in imgre.findall(html):
        # The pattern has two groups, so each match is (path, extension).
        Img = match[0]
        if Img in alreadyImg:
            continue
        alreadyImg.add(Img)
        print(Img)
        if not Img.startswith(("http://", "https://")):
            Img = "http://acm.hrbust.edu.cn/" + Img
        try:
            # NOTE(review): the file is always saved with a .jpg suffix, even
            # when the matched avatar is a .png.
            urllib.urlretrieve(Img, 'C:/%s.jpg' % x)
        except IOError:
            # urllib reports failed connections as IOError; urllib2.URLError
            # is a subclass of it, so those are still covered as well.
            continue
        x += 1
# Work through the queued pages in FIFO order until none are left.
while not urllist.empty():
    url = urllist.get()
    print(url)
    try:
        r = s.get(url)
        html = r.text
        # Only rank-list pages are mined for further links; every fetched
        # page is scanned for avatar images.
        if "index.php?m=Ranklist&a=showRatingrank" in url:
            putUrl(html)
        getImg(html)
    except (requests.exceptions.RequestException, IOError) as e:
        # The page is fetched with requests, so its failures surface as
        # RequestException; IOError covers download errors from urllib.
        # The crawl is best-effort: report the failure and move on.
        print("skipping %s: %r" % (url, e))
View Code

 

爬虫爬oj用户头像

标签:

原文地址:http://www.cnblogs.com/icodefive/p/5440455.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!