
Python crawler 004: scraping posts from the front page of Qiushibaike's "textnew" section with urllib2 and regular expressions



The procedural approach

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python 2 (urllib2 was split across urllib.request/urllib.error in Python 3)

import urllib2
import sys
import re
import os

# local filesystem encoding, used to re-encode the page so the Chinese
# text survives the trip to disk
encoding = sys.getfilesystemencoding()

if __name__ == '__main__':
    # 1. Fetch one listing page and get its HTML source
    url = 'http://www.qiushibaike.com/textnew/'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        req = urllib2.Request(url=url, headers=headers)
        res = urllib2.urlopen(req)
        html = res.read().decode('utf-8').encode(encoding)
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    # 2. Extract the data we want from the HTML source: post id and post content
    regex_content = re.compile(
        r'<div class="article block untagged mb15" id=(.*?)>(?:.*?)'
        r'<div class="content">(.*?)</div>',
        re.S)
    items = re.findall(regex_content, html)
    for item in items:
        # the id is captured together with its surrounding quotes; strip them
        file_name = item[0].strip('"\'')
        content = item[1].strip().lstrip('<span>').rstrip('</span>') \
            .replace('\n', '').replace('<br/>', '\n')
        # 3. Save the scraped data, one post per file
        path = 'qiubai'
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = path + '/' + file_name + '.txt'
        with open(file_path, 'w') as fp:  # the with block closes the file for us
            fp.write(content)
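Note that urllib2 exists only under Python 2. For reference, here is a minimal sketch of the same download step ported to Python 3, where Request/urlopen live in urllib.request and the exception classes in urllib.error; the regex and file handling would stay the same, minus the re-encoding, since Python 3 strings are already unicode:

import urllib.request
import urllib.error

url = 'http://www.qiushibaike.com/textnew/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'}
try:
    req = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(req).read().decode('utf-8')
except (urllib.error.HTTPError, urllib.error.URLError) as e:
    print(e)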

The object-oriented approach

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import re
import os
import sys

# local filesystem encoding, used when re-encoding the downloaded page
encoding = sys.getfilesystemencoding()


class Spider:
    def __init__(self):
        # %s is filled in with the page number
        self.url = 'http://www.qiushibaike.com/textnew/page/%s/?s=4979315'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'

    # Fetch the HTML source of one listing page
    def get_page(self, page_index):
        headers = {'User-Agent': self.user_agent}
        try:
            req = urllib2.Request(url=self.url % str(page_index), headers=headers)
            res = urllib2.urlopen(req)
            html = res.read().decode('utf-8').encode(encoding)
            return html
        except urllib2.HTTPError as e:
            print e
            exit()
        except urllib2.URLError as e:
            print e
            exit()

    # Parse the HTML source: capture (post id, post content) pairs
    def analysis(self, html):
        regex_content = re.compile(
            r'<div class="article block untagged mb15" id=(.*?)>(?:.*?)'
            r'<div class="content">(.*?)</div>',
            re.S)
        items = re.findall(regex_content, html)
        return items

    # Save the scraped data, one file per post
    def save(self, items, path):
        if not os.path.exists(path):
            os.makedirs(path)
        for item in items:
            # the id is captured together with its surrounding quotes; strip them
            file_name = item[0].strip('"\'')
            content = item[1].strip().lstrip('<span>').rstrip('</span>') \
                .replace('\n', '').replace('<br/>', '\n')
            file_path = path + '/' + file_name + '.txt'
            with open(file_path, 'w') as fp:  # the with block closes the file for us
                fp.write(content)

    # Entry point: crawl pages 1 and 2
    def run(self):
        print u'Starting to scrape...'
        for i in range(1, 3):
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, 'qiubai')
        print u'Scraping finished.'


if __name__ == '__main__':
    sp = Spider()
    sp.run()
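To make the pattern easier to follow, here is a small self-contained demo of what the two capture groups return. The HTML fragment below is made up to mirror the structure the regex expects; it is not real page output:

# -*- coding: utf-8 -*-
import re

# made-up fragment shaped like one post block on the listing page
sample = '''<div class="article block untagged mb15" id="qiushi_tag_123">
<div class="author">someone</div>
<div class="content"><span>First line<br/>second line</span></div>
</div>'''

regex_content = re.compile(
    r'<div class="article block untagged mb15" id=(.*?)>(?:.*?)'
    r'<div class="content">(.*?)</div>',
    re.S)
for post_id, body in re.findall(regex_content, sample):
    print post_id.strip('"')  # -> qiushi_tag_123 (used as the file name)
    # -> "First line\nsecond line" (used as the file content)
    print body.strip().lstrip('<span>').rstrip('</span>').replace('<br/>', '\n')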



Original source: http://www.cnblogs.com/guanfuchang/p/6802191.html
