
Python Crawler in Practice (3): Scraping NetEase News

Date: 2015-11-28 21:37:59


Code (the script fetches the news.163.com homepage, collects article URLs with a regex, then extracts each article's title, timestamp, source, and body, and appends them to a text file):

# _*_ coding:utf-8 _*_
# Python 2 script: urllib2 and print-statement syntax are used throughout.
import urllib2
import re
#import sys

#reload(sys)
#sys.setdefaultencoding('utf-8')
    
class Tool:
    # Regexes that strip NetEase article markup down to plain text.
    removeImg = re.compile(r'<p class="f_center".*?</p>')  # centered image paragraphs
    removeAddr = re.compile(r'<a.*?>|</a>')                # links (anchor text is kept)
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')    # block endings -> newline
    replaceTD = re.compile(r'<td>')                        # table cells -> tab
    replacePara = re.compile(r'<p.*?>')                    # paragraph start -> newline + indent
    replaceBR = re.compile(r'<br><br>|<br>')               # line breaks -> newline
    removeExtraTag = re.compile(r'<.*?>')                  # any remaining tag

    def replace(self, text):
        text = re.sub(self.removeImg, "", text)
        text = re.sub(self.removeAddr, "", text)
        text = re.sub(self.replaceLine, "\n", text)
        text = re.sub(self.replaceTD, "\t", text)
        text = re.sub(self.replacePara, "\n" + "  ", text)
        text = re.sub(self.replaceBR, "\n", text)
        text = re.sub(self.removeExtraTag, "", text)
        return text.strip()
        

class WYXW:
    def __init__(self, baseUrl):
        self.baseURL = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        #self.file = None
        self.fileName = u'网易新闻'  # default output file name ("NetEase News")
        self.tool = Tool()

    def get_homepage(self):
        url = self.baseURL
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8', 'ignore')
        #print content#.encode('gbk','ignore')
        return content

    def extract_url(self, homepage):
        # Matches article URLs of the form
        # http://news.163.com/yy/mmdd/hh/<16-char id>.html
        pattern = r"http://news\.163\.com/\d{2}/\d{4}/\d{2}/\w{16}\.html"
        news_url = re.findall(pattern, homepage)
        #print news_url
        return news_url

    def extract_sub_web_time(self, sub_web):
        # Publication timestamp, e.g. 2015-11-28 21:37:59
        pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', re.S)
        time = re.findall(pattern, sub_web)
        print time[0]
        return time[0]

    def extract_sub_web_source(self, sub_web):
        pattern = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>')
        source = re.findall(pattern, sub_web)
        print source[0]
        return source[0]

    def extract_sub_web_title(self, sub_web):
        #pattern = "<title>.+</title>"
        #pattern = '<h1 id="h1title" class="ep-h1">(.*?)</h1>'
        pattern = re.compile(r'<h1 id="h1title" class="ep-h1">(.*?)</h1>', re.S)
        title = re.findall(pattern, sub_web)
        if title:  # findall returns a list (never None), so test for emptiness
            print title[0]
            return title[0]
        else:
            return None

    def extract_sub_web_content(self, sub_web):
        #pattern = "<div id=\"Cnt-Main-Article-QQ\".*</div>"
        pattern = re.compile(r'<div id="endText".*?>(.*?)<!.*?-->', re.S)
        content = re.findall(pattern, sub_web)
        #print content[0]
        if content:
            return content[0]
        else:
            return None

    def writeData(self, fName):
        if fName is not None:
            file = open(fName + ".txt", "w+")
        else:
            file = open(self.fileName + ".txt", "w+")
        homepage = self.get_homepage()
        news_urls = self.extract_url(homepage)
        for url in news_urls:
            print url
            web = urllib2.urlopen(url).read()
            title = self.extract_sub_web_title(web)
            content = self.extract_sub_web_content(web)
            if title is None or content is None:
                continue  # not a standard article page; skip it
            time = self.extract_sub_web_time(web).strip()
            source = self.extract_sub_web_source(web).strip()
            content = self.tool.replace(content)
            news = title.strip() + "\n\n" + time + "\t" + source + "\n\n" + content + "\n"
            file.write(news)
            sep = "\n" + "-" * 73 + "\n"
            file.write(sep)
            print u"新闻写入成功" + "\n"  # "news written successfully"
        file.close()

baseUrl = "http://news.163.com"
wyxw = WYXW(baseUrl)
wyxw.writeData(None)
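
Since the cleanup class is just a pile of regexes, it is easy to sanity-check in isolation. A quick demo of Tool.replace, appended to the same file; the sample HTML below is invented for illustration:

# Demo of Tool.replace on made-up article markup.
tool = Tool()
sample = ('<p class="f_center"><img src="x.jpg"/></p>'
          '<p>Hello <a href="/n">NetEase</a></p><br>')
print tool.replace(sample)   # prints: Hello NetEase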

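Note that the script is Python 2 only: urllib2 and the print statement do not exist in Python 3. A minimal sketch of the fetch-and-collect step under Python 3, assuming the 2015-era URL layout of news.163.com (which may no longer match the live site):

# Python 3 sketch of get_homepage + extract_url; the URL pattern is the
# same 2015-era one used above and may no longer hold today.
import re
import urllib.request

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
req = urllib.request.Request("http://news.163.com", headers=headers)
with urllib.request.urlopen(req) as resp:
    homepage = resp.read().decode('utf-8', 'ignore')

pattern = r"http://news\.163\.com/\d{2}/\d{4}/\d{2}/\w{16}\.html"
for url in sorted(set(re.findall(pattern, homepage))):  # dedupe repeated links
    print(url)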

Original post: http://www.cnblogs.com/AndyJee/p/5003385.html
