码迷,mamicode.com
首页 > 编程语言 > 详细

Python爬取17吉他网吉他谱

时间:2016-07-01 21:30:03      阅读:205      评论:0      收藏:0      [点我收藏+]

标签:

最近学习吉他,一张一张保存吉他谱太麻烦,写个小程序下载吉他谱。

安装 BeautifulSoup,BeautifulSoup是一个解析HTML的库。
pip install BeautifulSoup4

在这个程序中 BeautifulSoup 使用 html5lib 所以还要安装 html5lib
pip install html5lib

代码如下:

# -*- coding: utf-8 -*-
#coding=UTF8

import os
import sys
import logging
import urllib
import urllib2
import chardet
import re
import cookielib
import urlparse

from bs4 import BeautifulSoup

# Filesystem encoding reported by the interpreter, captured once at import
# time. NOTE(review): not referenced by any code visible in this file.
sysEncoding = sys.getfilesystemencoding()
# Single cookie jar handed to every opener built in this module, so the
# page requests and the image downloads share the same cookies.
cookieJar = cookielib.CookieJar()

def get(url, timeout=None):
    """Fetch *url* using the shared cookie jar and return the response body.

    Args:
        url: URL to request.
        timeout: Optional socket timeout in seconds. When None (the
            default) the request is opened exactly as before, with the
            library's default timeout behaviour.

    Returns:
        The raw, undecoded body returned by ``response.read()``.

    Raises:
        Errors raised while opening or reading the URL (for example
        ``urllib2.URLError``) are not caught here and reach the caller.
    """
    req = urllib2.Request(url)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))

    if timeout is None:
        response = opener.open(req)
    else:
        response = opener.open(req, timeout=timeout)

    try:
        return response.read()
    finally:
        # Always release the connection; this function is called once per
        # page, so leaked responses would pile up during a long crawl.
        response.close()

def download_guitar_image(url, target):
    """Download the image at *url* and write it to the file *target*.

    Args:
        url: URL of the image to fetch.
        target: Path of the file to create (opened in binary write mode,
            so an existing file is overwritten).

    Raises:
        Errors raised while opening or reading the URL (for example
        ``urllib2.URLError``) and errors from opening/writing *target*
        are not caught here.
    """
    print('start download guitar image ...')

    req = urllib2.Request(url)
    req.add_header('Accept', 'image/webp,image/*,*/*;q=0.8')

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    response = opener.open(req)
    try:
        # Read the whole body before touching the disk so a failed
        # transfer does not leave a truncated file behind.
        content = response.read()
    finally:
        response.close()

    with open(target, 'wb') as image_file:
        image_file.write(content)

def parse_guitar_img_link():
    """Collect the title and link of every guitar-tab page in the site index.

    Walks ``tab/img/index.php?page=N`` starting at N=1 and reads the
    anchors matched by the ``#ct dl > dt > a`` selector on each page.
    The walk stops at the first page that yields no anchors, or that
    yields only links already collected (a guard against a server that
    keeps returning the same page for out-of-range page numbers).

    Returns:
        A list of dicts with the keys ``'title'`` (anchor text) and
        ``'link'`` (anchor href resolved against the site root), in the
        order encountered and without duplicate links.
    """
    page_list = []
    seen_links = set()

    url_base = 'http://www.17jita.com/'

    page = 1
    while True:
        url = url_base + 'tab/img/index.php?page=' + str(page)
        print(url)

        html = get(url)
        soup = BeautifulSoup(html, "html5lib")

        anchors = soup.select('#ct dl > dt > a')
        if not anchors:
            break

        found_new = False
        for item in anchors:
            # urljoin gives the same result as plain concatenation for a
            # relative href, and also copes with absolute or
            # root-relative hrefs.
            link = urlparse.urljoin(url_base, item['href'])
            if link in seen_links:
                continue
            seen_links.add(link)
            page_list.append({'title': item.text, 'link': link})
            found_new = True

        if not found_new:
            # Nothing new on this page: treat it as the end of the index
            # instead of looping forever.
            break

        page += 1

    return page_list
    
    
def download_guitar_image_link_list(url):
    """Return the image URLs found on a tab article and its follow-up pages.

    The first page is *url* itself; later pages are derived by inserting
    a page suffix before ``.html``. Images are taken from the
    ``#article_contents a > img`` selector.

    The walk ends when a page cannot be fetched (the error is printed and
    logged) or when a page contributes no image that was not already
    collected. The second condition keeps the loop finite when the server
    answers a missing page with a normal response, or when *url* contains
    no ``.html`` to rewrite.

    Args:
        url: URL of the first page of the article.

    Returns:
        A list of image ``src`` values in page order, without duplicates.
    """
    image_link_list = []
    seen = set()

    page = 1
    while True:
        page_url = url
        if page > 1:
            # NOTE(review): the separator was garbled in the published
            # source; '_' (e.g. 1234_2.html) is presumed to be the site's
            # paging scheme -- confirm against a live multi-page article.
            page_url = url.replace('.html', '_' + str(page) + '.html')

        try:
            html = get(page_url)
        except urllib2.URLError as e:
            # e.reason is not always a string (it can be an exception
            # object), so format it instead of concatenating.
            msg = u'下载 %s 出错, 原因: %s' % (page_url, e.reason)
            print(msg)
            logging.error(msg)
            break

        soup = BeautifulSoup(html, 'html5lib')

        found_new = False
        for img in soup.select('#article_contents a > img'):
            src = img.get('src')
            if not src or src in seen:
                continue
            seen.add(src)
            image_link_list.append(src)
            found_new = True

        if not found_new:
            break

        page += 1

    return image_link_list

if __name__ == '__main__':
    # Script entry point: list every tab page on the site, create one
    # directory per tab under ./guitar and download its images into it.

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        filename='guitar.log',
        filemode='a')

    path = 'guitar'
    if not os.path.exists(path):
        os.mkdir(path)

    page_list = parse_guitar_img_link()
    for page in page_list:

        print(page['link'] + '(' + page['title'] + ')')

        # Titles come straight from the web page: replace characters that
        # are path separators or are rejected in Windows file names so a
        # title cannot break (or escape) the target directory.
        safe_title = re.sub(u'[\\\\/:*?"<>|]', u'_', page['title']).strip()
        if not safe_title:
            safe_title = u'untitled'

        # Paths are kept as text rather than encoded to a fixed codec
        # (previously GBK), leaving the encoding to the interpreter and OS.
        guitar_path = os.path.join(path, safe_title)
        if not os.path.exists(guitar_path):
            os.mkdir(guitar_path)

        image_link_list = download_guitar_image_link_list(page['link'])
        for image_link in image_link_list:

            print('\t' + image_link)

            # Last URL segment is the file name; rsplit does not raise
            # when the link unexpectedly contains no '/'.
            filename = image_link.rsplit('/', 1)[-1]
            filepath = os.path.join(guitar_path, filename)

            # Skip files from an earlier run so an interrupted download
            # can be resumed by simply starting the script again.
            if os.path.exists(filepath):
                continue

            try:
                download_guitar_image(image_link, filepath)
            except urllib2.URLError as e:
                # One broken image should not abort the whole crawl.
                logging.error(u'下载 %s 出错, 原因: %s', image_link, e.reason)
    
    
    
    
    
    
    

程序中还存在一些问题尚待优化,比如下载中断后,无法继续下载剩下的吉他谱。

Python爬取17吉他网吉他谱

标签:

原文地址:http://www.cnblogs.com/wuliqv/p/5634259.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!