# -*- coding: utf-8 -*-
# -----------------------------------------------
#   Program : Douban group image crawler
#   Version : 1.0
#   Language: Python 3.4
#   Author  : gdp12315
#   Usage   : enter the group discussion-board URL, the first page and
#             the last page
#   Purpose : download the images posted in the group's topics
#   Note    : images are saved to a path on the author's machine;
#             change SAVE_DIR to suit your own setup
# -----------------------------------------------
import os
import random
import socket
import http.cookies
import http.cookiejar
import re
import time
import urllib.request

ERROR = {
    '0': 'Can not open the url,checck you net',
    '1': 'Creat download dir error',
    '2': 'The image links is empty',
    '3': 'Download faild',
    '4': 'Build soup error,the html is empty',
    '5': 'Can not save the image to your disk',
}

# Directory the downloaded images are written to (raw string so the
# backslashes are never interpreted as escape sequences).
SAVE_DIR = r'D:\Python\pics'

# Number of topics listed on one page of a group's discussion board; the
# "start=" query value advances by this amount per page.
TOPICS_PER_PAGE = 25

# Compiled once at import time instead of on every loop iteration.
TOPIC_URL_RE = re.compile(r'http://www\.douban\.com/group/topic/\d+')
# Non-greedy so that several image URLs on one HTML line are matched
# individually instead of being merged into a single bogus match.
IMAGE_URL_RE = re.compile(
    r'http://img\d\.douban\.com/view/group_topic/large/public/.+?\.jpg')
IMAGE_ID_RE = re.compile(r'p\d{7,}')


class BrowserBase(object):
    """Small HTTP client that sends browser-like headers with each request."""

    def __init__(self):
        # Process-wide default so that a stalled connection cannot hang
        # the crawl forever.
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        """Print a message in the form ``[name]content``."""
        print('[%s]%s' % (name, content))

    def openurl(self, url):
        """Open *url* and return the response object.

        A fresh cookie-aware opener is built and installed globally, so
        later ``urllib.request.urlretrieve`` calls send the same headers.
        The User-agent is picked at random and the Referer is set to
        *url* itself.

        Raises:
            Whatever exception the underlying open call raised; it is
            reported through ``speak`` and then re-raised unchanged.
        """
        cookie_support = urllib.request.HTTPCookieProcessor(
            http.cookiejar.CookieJar())
        self.opener = urllib.request.build_opener(
            cookie_support, urllib.request.HTTPHandler)
        urllib.request.install_opener(self.opener)
        user_agents = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv: Gecko/20071127 Firefox/',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv: Gecko/20070731 Ubuntu/dapper-security Firefox/',
            'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
            "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
        ]
        agent = random.choice(user_agents)
        self.opener.addheaders = [
            ("User-agent", agent),
            ("Accept", "*/*"),
            ('Referer', url),
        ]
        try:
            res = self.opener.open(url)
        except Exception as e:
            self.speak(str(e), url)
            # Re-raise the original error so its type and traceback survive.
            raise
        else:
            return res


def page_offsets(page_bgn, page_end):
    """Return the ``start=`` offsets for pages *page_bgn*..*page_end*.

    Pages are 1-based and inclusive; an empty range results when
    *page_bgn* is greater than *page_end*.
    """
    first = (page_bgn - 1) * TOPICS_PER_PAGE
    last = (page_end - 1) * TOPICS_PER_PAGE
    return range(first, last + 1, TOPICS_PER_PAGE)


def find_topic_urls(html):
    """Return the topic URLs found in *html*, without duplicates, in order."""
    seen = set()
    unique = []
    for topic_url in TOPIC_URL_RE.findall(html):
        if topic_url not in seen:
            seen.add(topic_url)
            unique.append(topic_url)
    return unique


def find_image_urls(html):
    """Return every large-size topic image URL found in *html*."""
    return IMAGE_URL_RE.findall(html)


def image_file_name(img_url):
    """Return the local file name (``p<digits>.jpg``) for *img_url*.

    Falls back to the last path segment of the URL when no ``p<digits>``
    identifier is present.
    """
    match = IMAGE_ID_RE.search(img_url)
    if match:
        stem = match.group(0)
    else:
        stem = img_url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    return stem + '.jpg'


def fetch_html(browser, url):
    """Download *url* through *browser* and return it decoded as UTF-8."""
    res = browser.openurl(url)
    try:
        return res.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        res.close()


def main():
    """Ask for a group URL and page range, then download the images."""
    splider = BrowserBase()
    # ------------ begin ----------------------------
    # Example input: the group discussion URL ending in "start=",
    # then 1 (first page), then 2 (last page).
    url = input('请输入豆瓣小组地址,去掉start=后面的数字:\n')
    page_bgn = int(input('请输入开始时的页码:\n'))
    page_end = int(input('请输入结束时的页码:\n'))

    # urlretrieve cannot create missing directories on its own.
    os.makedirs(SAVE_DIR, exist_ok=True)

    for offset in page_offsets(page_bgn, page_end):
        # Fetch one page of the topic list.
        html_topic_list = fetch_html(splider, url + str(offset))
        # Visit each topic and download the images it contains.
        for topic_url in find_topic_urls(html_topic_list):
            print('topic_url ' + topic_url)
            html_topic = fetch_html(splider, topic_url)
            # A topic may contain several images.
            for img_url in find_image_urls(html_topic):
                print('img_url: ' + img_url)
                target = os.path.join(SAVE_DIR, image_file_name(img_url))
                try:
                    urllib.request.urlretrieve(img_url, target)
                except OSError as e:
                    # One broken image should not abort the whole crawl.
                    splider.speak(str(e), img_url)
            # Pause between topics to avoid hammering the server.
            time.sleep(2)
    print('采集完成!')


if __name__ == '__main__':
    main()