初学 Python, 结合网络资料整理
#!/usr/bin/python
# coding=UTF-8
"""Crawler v1.0 (L).

Interactive console reader: a background thread downloads numbered pages of
a site and extracts entries with a regular expression, while the main thread
shows them one entry at a time and reads single-letter commands from stdin.
"""

import contextlib
import re
import sys
import time

# ====================================
# Pick the modules to load according to the running Python version
# ====================================
if sys.version_info[0] >= 3:
    import urllib.request as urlreq
    import _thread as thread

    _read_line = input
else:
    import urllib2 as urlreq
    import thread as thread

    # On Python 2, input() evaluates whatever the user types; raw_input()
    # is the counterpart of Python 3's input().
    _read_line = raw_input  # noqa: F821


class Spider:
    """Background page loader plus console viewer for a paginated listing."""

    def __init__(self):
        self.pageidx = 1    # number of the next page to request
        self.items = []     # downloaded pages not yet shown (one match list per page)
        self.read = False   # run flag -- True: keep going; False: stop
        self.restr = ''
        # UA list
        self.user_agent = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', '', '']
        # Browser UA sent with every request (in case the site filters on headers)
        self.header = {'User-Agent': self.user_agent[0]}

    # =========================================================
    # Purpose: fetch a page and extract its content (by rule)
    # Params:  self; url: page address; recom: regular expression
    # =========================================================
    def GetItems(self, url, recom):
        """Download *url* and return ``re.findall(recom, page_text)``.

        The body is decoded as UTF-8. Network and decoding errors propagate
        to the caller.
        """
        request = urlreq.Request(url, headers=self.header)
        # closing() releases the connection even if read() fails; it is used
        # instead of a plain ``with`` because urllib2 responses on Python 2
        # are not context managers.
        with contextlib.closing(urlreq.urlopen(request)) as response:
            html = response.read()
        # Page encoding conversion (encode: unicode -> other; decode: other -> unicode)
        uhtml = html.decode("UTF-8")
        # Select content from the page according to the rule
        return re.findall(recom, uhtml)

    # =========================================================
    # Purpose: load page content
    # Params:  self; url: address template; recom: regular expression
    # =========================================================
    def LoadItems(self, url, recom):
        """Keep up to two pages buffered in ``self.items`` while ``self.read`` is set.

        ``$PX$`` in *url* is replaced by the current page number. On the first
        failed fetch a message is printed and ``self.read`` is cleared, which
        also ends this loop.
        """
        while self.read:
            # When fewer than two pages are buffered, request the next page
            # and store it in the items list.
            if len(self.items) < 2:
                # Built outside the try block so the failure message below
                # can always refer to it.
                uri = url.replace("$PX$", str(self.pageidx))
                try:
                    item = self.GetItems(uri, recom)
                except Exception:
                    # This is the top level of the loader thread: report the
                    # failure and stop both loops rather than die silently.
                    print('[ ' + uri + ' ] 被外星人劫持[访问失败]....')
                    self.read = False
                else:
                    self.items.append(item)
                    self.pageidx += 1
            else:
                time.sleep(1)

    # =========================================================
    # Purpose: show page content (and read user input)
    # Params:  self; items: entries of one page; pidx: page number to display
    # =========================================================
    def ShowItems(self, items, pidx=1):
        """Print each entry of one page, waiting for a command after each.

        Each entry is indexed as ``entry[0]`` (body) and ``entry[1]`` (shown
        after the page number).

        Returns:
            9 if the user entered ``Q`` (``self.read`` is cleared as well),
            0 if the user entered ``R`` (``self.pageidx`` is reset to 1),
            1 once every entry has been shown.
        """
        for item in items:
            body = item[0].replace('\n', '').replace('<br/>', '\n')
            # Joined into a single string so Python 2 (where print is a
            # statement) shows the text rather than a tuple repr; the output
            # on Python 3 is the same as printing the four parts separately.
            print(u' '.join((u'[%d]' % pidx, item[1], u'\n', body)))
            print('-' * 70)
            try:
                option = _read_line().upper()
            except EOFError:
                # stdin was closed: nothing more can be read, so quit.
                option = 'Q'
            if option == 'Q':
                self.read = False
                return 9
            elif option == 'R':
                # NOTE(review): only the loader's page counter is reset here;
                # pages already buffered in self.items are not discarded.
                # Confirm whether that is the intended "refresh" behaviour.
                self.pageidx = 1  # initial value
                return 0
        return 1

    # =========================================================
    # Purpose: start the loader thread and run the display loop
    # Params:  self; url: address template; recom: regular expression
    # =========================================================
    def Launcher(self, url, recom):
        """Start ``LoadItems`` in a new thread and show pages until stopped."""
        self.read = True
        pidx = self.pageidx
        print(u' '.join((u'正在加载中请稍候......\n', u'=' * 70)))
        thread.start_new_thread(self.LoadItems, (url, recom))
        while self.read:
            if not self.items:
                # Nothing buffered yet: yield the CPU instead of spinning.
                time.sleep(0.1)
                continue
            item = self.items.pop(0)
            ret = self.ShowItems(item, pidx)
            if ret == 9:
                print(u'即将推出阅读....')
                break
            pidx += ret


def usage():
    """Print the program banner and the key bindings."""
    print(u"""
    ---------------------------------------------
    - 程序:糗百爬虫(阅读文章)
    - 版本:0.1
    - 作者:L
    - 日期:2015-06-30
    - 语言:Python 3.3.5
    - 操作:Q:退出阅读; R:刷新到第一页; 回车:阅读
    - 功能:按下回车依次浏览今日的糗百热点
    ---------------------------------------------
    """)


if __name__ == "__main__":
    usage()
    url = 'http://m.qiushibaike.com/hot/page/$PX$'
    # re.S makes "." match any character, including newlines
    recom = re.compile(r'<div class="content">(.*?)<!--(.*?)-->.*?</div>', re.S)
    _read_line('浏览今日的糗百内容(任意键):')
    spider = Spider()
    spider.Launcher(url, recom)
    print('欢迎再次使用....')
原文地址:http://chnjone.blog.51cto.com/4311366/1669508