html2doc

时间：2017-10-28 19:52:51 阅读：1445 评论：0 收藏：0 [点我收藏+]

标签：error: rom headers 就是 oct mob 定位 _id agent

1.参考

Python 爬虫：把廖雪峰教程转换成 PDF 电子书

https://github.com/lzjun567/crawler_html2pdf

wkhtmltopdf 就是一个非常好的工具，它适用于多平台，可以完成 html 到 pdf 的转换；pdfkit 是 wkhtmltopdf 的 Python 封装包。

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

也可以通过 BeautifulSoup 插入删除tag

soup.insert
soup.decompose

2.安装

https://wkhtmltopdf.org/downloads.html

下载版本 Windows (MinGW) 0.12.4 32-bit / 64-bit for Windows XP/2003 or later; standalone

添加路径 D:\Program Files\wkhtmltopdf\bin

需要重新打开cmd以及notepad++。。。

pip install pdfkit

API https://pypi.python.org/pypi/pdfkit

定制options，搜索关键字 https://wkhtmltopdf.org/usage/wkhtmltopdf.txt

# Example wkhtmltopdf options as passed to pdfkit; keys are wkhtmltopdf
# command-line option names without the leading "--".
options = {
    'page-size': 'Letter',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",  # needed so Chinese text is handled correctly
    # A list of tuples repeats the option once per entry.
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    # None marks a flag that takes no value.
    'no-outline': None
}

# Render a live URL straight to out.pdf using the options defined above.
pdfkit.from_url('http://google.com', 'out.pdf', options=options)

3.背景知识

3.1url 相对路径绝对路径

In [323]: urlparse.urljoin('https://doc.scrapy.org/en/latest/index.html', 'intro/overview.html')  #相当于 ./intro/overview.html，其中 . 指代当前文件夹 latest
Out[323]: 'https://doc.scrapy.org/en/latest/intro/overview.html'

In [324]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', '#walk-through-of-an-example-spider')  #当前网页某个tag id=walk-through-of-an-example-spider
Out[324]: 'https://doc.scrapy.org/en/latest/intro/overview.html#walk-through-of-an-example-spider'

In [326]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', 'install.html')  #相当于 ./install.html
Out[326]: 'https://doc.scrapy.org/en/latest/intro/install.html'

In [327]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', '../topics/commands.html')  # .. 指代当前文件夹intro的上一层文件夹latest
Out[327]: 'https://doc.scrapy.org/en/latest/topics/commands.html'

https://doc.scrapy.org/en/latest/index.html

这一类官方文档一般页脚都为：

Built with Sphinx using a theme provided by Read the Docs.

3.2页面布局规律

点击左上角 home 图标转到首页
左边栏页面导航
- <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
  - <a class="reference internal" href="intro/overview.html">Scrapy at a glance</a></li>
正文主体
- <div class="rst-content">

3.3转换pdf时注意事项

提取正文主体后，可以直接将 <div xxxx </div> 保存html，不需要补全 <html>
图片链接相对路径需要转换为绝对路径，才会自动加载图片
- <img alt="Inspecting elements with Firebug" src="../_images/firebug1.png" style="width: 913px; height: 600px;">
pdfkit.from_file 第一个参数 input 为 html文件路径列表，文件名不能是中文。。。
- pdfkit.from_file(self.htmls_saved, self.netloc+'.pdf', options=options)
pdf会根据<h1> <h2>等标题 tag 自动生成目录

4.实践代码

#!/usr/bin/env python
#coding:utf-8

import os
import sys
import traceback
import re
import urlparse
import threading
import Queue

import requests
from scrapy import Selector
import pdfkit


import copy
from warnings import filterwarnings

from urllib3.exceptions import InsecureRequestWarning

# Shared direct (non-proxied) session.
s = requests.Session()
# Alternative mobile / WeChat headers kept for reference:
# s.headers.update({'user-agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G'})
s.headers.update({'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'})
# s.headers.update({'Referer':'https://servicewechat.com/wx55b926152a8c3bef/14/page-frame.html'})
# Certificate verification is switched off for every request on this session;
# the matching InsecureRequestWarning is silenced at the end of this block.
s.verify = False
# Enlarge the connection pool for https:// URLs only.
s.mount('https://', requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000))

# Proxied twin of ``s``: same headers/adapters, routed through a local proxy.
sp = copy.deepcopy(s)
# NOTE(review): the 'https' entry uses an https:// proxy URL; a local proxy on
# 127.0.0.1:1080 usually speaks plain HTTP -- confirm the scheme is intended.
proxies = {'http': 'http://127.0.0.1:1080', 'https': 'https://127.0.0.1:1080'}
sp.proxies = proxies

filterwarnings('ignore', category = InsecureRequestWarning)

# Full-page HTML wrapper with {title} and {content} placeholders for
# str.format(). The <h1> line sits inside an HTML comment, so {title} is
# substituted but not rendered. Declares UTF-8 twice (meta charset and
# http-equiv) for the benefit of the HTML-to-PDF renderer.
html_template = u"""
<!DOCTYPE html>

<html>
    <head>
        <meta charset="utf-8" />
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    </head>
    <body>
        <!-- <center><h1>{title}</h1></center> -->
        {content}
    </body>
</html>
"""

# wkhtmltopdf options handed to pdfkit; option names are documented at
# https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
options = {
    'page-size': 'A4',  # alternative: Letter
    'minimum-font-size': 25,  # overwritten by HTMLtoPDF(font_size=...)
    # 'image-dpi': 1500,  # optional tweak, currently disabled

    'margin-top': '0.1in',  # pdfkit README example uses 0.75in
    'margin-right': '0.1in',
    'margin-bottom': '0.1in',
    'margin-left': '0.1in',
    'encoding': 'UTF-8',  # needed so Chinese text is handled correctly
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    # Placeholder cookies copied from the pdfkit README example.
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'outline-depth': 10,
}


class HTMLtoPDF(object):
    """Download the pages linked from a documentation index and build one PDF.

    Constructing an instance fetches ``seed_url``, collects the links matched
    by ``css_links`` and queues them. ``run()`` then downloads the queued
    pages in worker threads, stores the ``css_content`` fragment of each page
    in a temp folder and passes the ordered fragments to ``pdfkit.from_file``.
    """

    # Splits an <img> tag around its double-quoted src value. Group 3 must
    # start at the closing quote, otherwise the rebuilt tag loses that quote.
    # Example: <img alt="_images/a.png" class="align-center" src="_images/a.png">
    # (The attribute keeps its historical "scr" spelling.)
    img_scr_pattern = re.compile(r'(<img\s+[^>]*?src\s*=\s*")(?P<src>.*?)(".*?>)')

    # Splits an <a> element into opening tag (around href), link text and
    # closing tag. An empty link text still matches (group "text" == '').
    # The text may itself be markup, e.g. on http://www.seleniumhq.org/docs/ :
    # <a href="http://openqa.org/"><img alt="openqa.org logo" src="/images/openqa-logo.png"/></a>
    a_href_pattern = re.compile(r'(<a\s+[^>]*?href\s*=\s*")(?P<href>.*?)(".*?>)(?P<text>.*?)(</a>)')

    def __init__(self, seed_url, font_size=25, css_links='div[class="wy-menu wy-menu-vertical"] a::attr(href)',
                css_content='div.rst-content', threads_count=30):
        """Prepare output folders and queue every page reachable from the index.

        Args:
            seed_url: URL of the documentation index page.
            font_size: value written to the 'minimum-font-size' PDF option.
            css_links: CSS selector yielding the href of each page to fetch.
            css_content: CSS selector of the element holding the page body.
            threads_count: number of download worker threads started by run().
        """
        self.seed_url = seed_url
        # NOTE: this writes into the module-level ``options`` dict, so the
        # value is shared by every instance created in the same process.
        options['minimum-font-size'] = font_size

        self.netloc = urlparse.urlparse(seed_url).netloc
        print(self.netloc)
        # Raw pages go to <script dir>/<netloc>, PDF input fragments to <script dir>/temp.
        self.folder = os.path.join(sys.path[0], self.netloc)
        self.folder_temp = os.path.join(sys.path[0], 'temp')
        for folder in (self.folder, self.folder_temp):
            if not os.path.isdir(folder):
                os.mkdir(folder)

        self.css_content = css_content
        self.css_links = css_links

        self.threads_count = threads_count
        self.htmls_saved = []
        self.links_queue = Queue.Queue()

        # The index page itself is always item 0. Link discovery runs last so
        # that all instance state exists before the first network request.
        self.links_queue.put(('0', self.seed_url))
        self.get_links()

    def get_links(self):
        """Fetch the seed page and queue each distinct linked page in document order."""
        text = self.load_page(self.seed_url)
        sel = Selector(text=text)

        # Typical hrefs: u'#selenium-documentation', u'00_Note_to-the-reader.jsp',
        # u'01_introducing_selenium.jsp#test-automation-for-web-applications'.
        # Fragments are stripped so each page is queued only once.
        links = [re.sub(r'#.*$', '', i) for i in sel.css(self.css_links).extract()]
        links_seen = [self.seed_url]

        # set(links) would lose the navigation order, so de-duplicate by hand.
        for link in links:
            link_abs = urlparse.urljoin(self.seed_url, link)
            if link_abs not in links_seen:
                # The sequence number later becomes the file-name prefix used for sorting.
                self.links_queue.put((str(len(links_seen)), link_abs))
                links_seen.append(link_abs)

    def save_html(self):
        """Worker loop: fetch queued pages and write them to disk.

        Never returns; intended to run in a daemon thread. Each queue item is
        always acknowledged with ``task_done()``, including when the page
        fails, so ``links_queue.join()`` cannot block forever on an error.
        """
        while True:
            # get() stays outside the try block so ``url`` is always bound
            # when the error line below is formatted.
            num, url = self.links_queue.get()
            try:
                text = self.load_page(url)
                title, content = self.parse_page(url, text)

                # Name for the raw page: keeps CJK characters from the title.
                filename_cn = u'{}_{}.html'.format(num, re.sub(u'[^\u4e00-\u9fa5\\w\\s()_-]', '', title))
                # Name for the PDF input fragment: CJK characters are removed
                # as well (the input file names given to pdfkit must not be
                # Chinese); '/' is stripped too, so the title cannot add path
                # components.
                filename = u'{}_{}.html'.format(num, re.sub(r'[^\w\s()_-]', '', title))

                with open(os.path.join(self.folder, filename_cn), 'wb') as fp:
                    fp.write(text.encode('utf-8', 'replace'))

                f = os.path.join(self.folder_temp, filename)
                with open(f, 'wb') as fp:
                    # Only the extracted fragment is written; wrapping it with
                    # html_template.format(content=content, title=title) is optional.
                    fp.write(content.encode('utf-8', 'replace'))
                self.htmls_saved.append(f)
                print('{}/{}'.format(len(self.htmls_saved), self.links_queue.qsize()))
            except Exception as err:
                # Best effort: report the failing page and carry on with the next one.
                print('{} {} {}'.format(url, err, traceback.format_exc()))
            finally:
                self.links_queue.task_done()

    def run(self):
        """Download all queued pages, then merge the saved fragments into <netloc>.pdf."""
        threads = [threading.Thread(target=self.save_html) for _ in range(self.threads_count)]

        for t in threads:
            # Daemon threads: the workers loop forever and must not keep the process alive.
            t.daemon = True
            t.start()

        self.links_queue.join()
        print('load done')

        def page_index(path):
            # File names look like "<num>_<title>.html"; order pages by <num>.
            name = os.path.basename(path)
            return int(name[:name.index('_')])

        self.htmls_saved.sort(key=page_index)
        pdfkit.from_file(self.htmls_saved, self.netloc + '.pdf', options=options)
        print(self.netloc + ' pdf done')

    def load_page(self, url):
        """Return the decoded text of ``url``, fetched through the proxied session ``sp``."""
        resp = sp.get(url)

        # requests reports ISO-8859-1 when the response headers give no
        # charset; in that case prefer the charset declared inside the
        # document, then fall back to detection from the body bytes.
        if resp.encoding == 'ISO-8859-1':
            encodings = requests.utils.get_encodings_from_content(resp.content)
            if encodings:
                resp.encoding = encodings[0]
            else:
                resp.encoding = resp.apparent_encoding

        return resp.text

    def parse_page(self, url, text):
        """Return ``(title, content)`` for a page.

        ``title`` is the text of <head><title>; ``content`` is the first
        element matching ``self.css_content`` after cleaning and link
        rewriting. Either is '' when nothing matches.
        """
        sel = Selector(text=text)

        title = sel.css('head title::text').extract_first() or ''
        content = sel.css(self.css_content).extract_first() or ''

        content = self.clean_content(content)
        content = self.modify_content(url, content)

        return title, content

    def clean_content(self, content):
        """Strip the language-preference widget and non-Python code samples."""
        sel = Selector(text=content)
        # extract() returns a list, so a missing widget simply means no iterations.
        for div in sel.css('div#codeLanguagePreference').extract():
            content = content.replace(div, '')

        for lang in ['java', 'csharp', 'ruby', 'php', 'perl', 'javascript']:
            for div in sel.css('div.highlight-%s' % lang).extract():
                content = content.replace(div, '')

        return content

    def modify_content(self, url, content):
        """Make image and link URLs absolute and append each link's URL to its text.

        Args:
            url: URL of the page ``content`` came from; base for relative URLs.
            content: HTML fragment to rewrite.

        Returns:
            The rewritten fragment. Same-page anchors (href starting with '#')
            are left untouched.
        """
        # Match groups are read-only, so each tag is rebuilt from its pieces.
        def func_src(m):
            # e.g. ../_images/firebug1.png on .../latest/topics/firebug.html
            # must be resolved against the page URL, not the seed URL.
            src = m.group('src')
            if not src.startswith('http'):
                src = urlparse.urljoin(url, src)
            return u'{}{}{}'.format(m.group(1), src, m.group(3))

        content = self.img_scr_pattern.sub(func_src, content)

        def func_href(m):
            href = m.group('href')
            text = m.group('text')
            if not href.startswith('#'):
                if not href.startswith('http'):
                    href = urlparse.urljoin(url, href)
                # Show the target in the text so it survives in the printed PDF.
                text = u'{text} ({href})'.format(text=text, href=href)
            return u'{g1}{href}{g3}{text}{g5}'.format(g1=m.group(1), g3=m.group(3), g5=m.group(5), href=href, text=text)

        content = self.a_href_pattern.sub(func_href, content)

        return content

    def modify_content2(self, url, content):
        """Selector-based variant of modify_content (not called elsewhere in this class).

        Rewrites by replacing each extracted tag string inside ``content``.
        """
        sel = Selector(text=content)

        # Make image URLs absolute, otherwise the PDF cannot load the images.
        # <img alt="_images/chapt3_img05_IDE_open.png" class="align-center" src="_images/chapt3_img05_IDE_open.png">
        for i in sel.css('img[src]'):
            tag = i.extract()
            src = i.xpath('./@src').extract_first()
            if not src.startswith('http'):
                src_abs = urlparse.urljoin(url, src)
                # Caveat: an alt value equal to src is replaced as well.
                tag_new = tag.replace(src, src_abs)
                content = content.replace(tag, tag_new)

        # Append the href to the link text.
        # <a class="reference external" href="http://code.google.com/p/selenium/issues/detail?id=1008">issue 1008</a>
        for i in sel.css('a[href]'):
            tag = i.extract()
            href = i.xpath('./@href').extract_first()
            text = i.xpath('./text()').extract_first()

            # Complete site-internal links; ignore same-page '#' anchors.
            if not href.startswith('http') and not href.startswith('#'):
                href_abs = urlparse.urljoin(url, href)
                tag_new = tag.replace(href, href_abs)
            else:
                href_abs = href
                tag_new = tag

            # Icon links have no text node (text is None); skip those.
            if text and not href.startswith('#'):
                text_new = u'{} ({})'.format(text, href_abs)
                tag_new = tag_new.replace(text, text_new)

            # Replace the whole tag in one go.
            content = content.replace(tag, tag_new)

        return content


if __name__ == '__main__':
    # Other documentation sites this script has been pointed at:
    # url = 'https://doc.scrapy.org/en/latest/index.html'
    # obj = HTMLtoPDF(url)
    #
    # url = 'http://python3-cookbook.readthedocs.io/zh_CN/latest/index.html'
    # obj = HTMLtoPDF(url, font_size=20, css_links='div[class="toctree-wrapper compound"] a::attr(href)')

    # Active target: the Selenium docs need their own link and content selectors.
    url = 'http://www.seleniumhq.org/docs/'
    obj = HTMLtoPDF(url, css_links='div#selenium-documentation a::attr(href)', css_content='div#mainContent')

    obj.run()