
Crawler Example: Scraping 淘女郎 (Taobao Model) Albums with selenium, PhantomJS, and BeautifulSoup

Published: 2016-06-06 16:27:58


Environment

OS: CentOS 6.7 (32-bit)

Python version: 2.6.6

Third-party packages

selenium

PhantomJS

BeautifulSoup
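
All three must be available before the script will run: selenium and BeautifulSoup install via pip, while PhantomJS is a standalone binary that has to be on your PATH. Note also that the script is Python 2 only (print statements, reload(sys)), which matches the stock interpreter on CentOS 6.7. As a quick sanity check, here is a minimal sketch (mine, not from the original post) that renders the listing page headlessly and parses the result:

# Environment sanity check: PhantomJS renders the page, BeautifulSoup
# parses the rendered source. Assumes the phantomjs binary is on PATH.
from bs4 import BeautifulSoup
from selenium import webdriver

browser = webdriver.PhantomJS()
browser.get('https://mm.taobao.com/json/request_top_list.htm?page=1')
print BeautifulSoup(browser.page_source, 'lxml').title
browser.quit()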

Code

# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
'''
Author: 昨夜星辰
'''
import os
import time
import shutil
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Prepend the scheme: hrefs on the page are protocol-relative ('//...').
def joint_url(string):
    return 'https:' + string

# Create a fresh folder: if the path already exists, remove it first
# (whether it is a directory or a plain file), then recreate it.
def create_folder(path):
    if os.path.exists(path):
        if os.path.isdir(path):
            shutil.rmtree(path)
        else:
            os.remove(path)
    os.mkdir(path)

root_folder = '淘女郎'
create_folder(root_folder)
url = 'https://mm.taobao.com/json/request_top_list.htm?page=1'
browser = webdriver.PhantomJS()
browser.get(url)
bs = BeautifulSoup(browser.page_source, 'lxml')

# Every model on the listing page sits in a <p class="top"> block.
for top in bs('p', 'top'):
    mm_url = joint_url(top.find('a')['href'])
    mm_name = top.find('a').text
    mm_age = top.find('em').text
    mm_city = top.find('span').text
    mm_folder = '%s/%s' % (root_folder, mm_name)
    create_folder(mm_folder)
    print 'Found a model: %s, age %s, city %s. Scraping her profile page...' % (mm_name, mm_age, mm_city)
    browser.get(mm_url)
    bs1 = BeautifulSoup(browser.page_source, 'lxml')
    base_info = bs1.find('ul', 'mm-p-info-cell clearfix')
    info_list = base_info('span')
    result = []
    result.append('Nickname: ' + info_list[0].text)
    result.append('Birthday: ' + info_list[1].text.strip())
    result.append('City: ' + info_list[2].text)
    result.append('Occupation: ' + info_list[3].text)
    result.append('Blood type: ' + info_list[4].text)
    result.append('School/major: ' + info_list[5].text)
    result.append('Style: ' + info_list[6].text)
    result.append('Height: ' + base_info.find('li', 'mm-p-small-cell mm-p-height').find('p').text)
    result.append('Weight: ' + base_info.find('li', 'mm-p-small-cell mm-p-weight').find('p').text)
    result.append('Measurements: ' + base_info.find('li', 'mm-p-small-cell mm-p-size').find('p').text)
    result.append('Cup size: ' + base_info.find('li', 'mm-p-small-cell mm-p-bar').find('p').text)
    result.append('Shoe size: ' + base_info.find('li', 'mm-p-small-cell mm-p-shose').find('p').text)
    print 'Profile collected; saving it to disk...'
    filename = '%s/%s.txt' % (mm_folder, mm_name)
    with open(filename, 'w') as f:
        f.write('\r\n'.join(result))
    print 'Saved! Now scraping her albums...'
    album_menu_url = joint_url(bs1.find('ul', 'mm-p-menu').find('a')['href'])
    browser.get(album_menu_url)
    time.sleep(3)
    bs2 = BeautifulSoup(browser.page_source, 'lxml')
    album_number = 1
    for album_info in bs2('div', 'mm-photo-cell-middle'):
        album_url = joint_url(album_info.find('h4').find('a')['href'])
        album_name = album_info.find('h4').find('a').text.strip()
        album_size = album_info.find('span', 'mm-pic-number').text
        print 'Scraping album #%d, "%s" %s...' % (album_number, album_name, album_size)
        browser.get(album_url)
        # Photos are lazy-loaded: scroll to the bottom repeatedly until
        # the page height stops growing.
        js1 = 'return document.body.scrollHeight'
        js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        old_scroll_height = 0
        while browser.execute_script(js1) > old_scroll_height:
            old_scroll_height = browser.execute_script(js1)
            browser.execute_script(js2)
            time.sleep(3)
        bs3 = BeautifulSoup(browser.page_source, 'lxml')
        photo_number = 1
        for photo_area in bs3('div', 'mm-photoimg-area'):
            print 'Downloading photo #%d of this album...' % photo_number,
            photo_url = joint_url(photo_area.find('a')['href'])
            browser.get(photo_url)
            bs4 = BeautifulSoup(browser.page_source, 'lxml')
            big_img_url = joint_url(bs4.find('img', id='J_MmBigImg')['src'])
            content = requests.get(big_img_url).content
            # Tag the filename with the album number as well; the photo
            # number alone would let later albums overwrite earlier ones.
            filename = '%s/%d_%d.jpg' % (mm_folder, album_number, photo_number)
            with open(filename, 'wb') as f:
                f.write(content)
            print 'Done!'
            photo_number += 1
        album_number += 1
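
The while loop over js1/js2 is the crux of the album step: photos only appear as the page is scrolled, so the script scrolls to the bottom, waits, and repeats until document.body.scrollHeight stops growing. The same idea as a reusable helper, sketched below; the max_rounds safety cap is my addition, not in the original:

def scroll_to_bottom(browser, pause=3, max_rounds=50):
    # Scroll until the page height stops growing, i.e. no more photos
    # are being injected; max_rounds guards against pages that grow forever.
    last_height = 0
    for _ in range(max_rounds):
        height = browser.execute_script('return document.body.scrollHeight')
        if height <= last_height:
            break
        last_height = height
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)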

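The listing URL is fetched with page=1 only, so the script covers just the first page of the ranking. The page parameter can be looped to go further; a minimal sketch (the cut-off of 10 pages is a made-up example, not something the site documents):

for page in range(1, 11):  # hypothetical page count
    browser.get('https://mm.taobao.com/json/request_top_list.htm?page=%d' % page)
    bs = BeautifulSoup(browser.page_source, 'lxml')
    for top in bs('p', 'top'):
        pass  # same per-model handling as in the script above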
 

Original article: http://www.cnblogs.com/yestreenstars/p/5564025.html
