Python爬虫与mysql

时间：2016-02-17 12:52:46 阅读：458 评论：0 收藏：0 [点我收藏+]

标签：

爬虫基础及正则表达式：http://blog.csdn.net/gzh0222/article/details/12647723

爬虫实战及进阶：http://www.cnblogs.com/xin-xin/p/4297852.html

其他网络资料：http://www.crifan.com/files/doc/docbook/python_topic_web_scrape/release/html/python_topic_web_scrape.html

　　　　　　 http://www.crifan.com/files/doc/docbook/web_scrape_emulate_login/release/html/web_scrape_emulate_login.html

Python与数据库：http://www.cnblogs.com/fnng/p/3565912.html

以下是爬糗事百科段子的Python源码

软件：Python 2.5

系统：win7

  1 # -*- coding: utf-8 -*-    
  2      
  3 import urllib2    
  4 import urllib    
  5 import re    
  6 import thread    
  7 import time    
  8   
  9     
 10 #----------- 加载处理糗事百科 -----------    
 11 class Spider_Model:    
 12         
 13     def __init__(self):    
 14         self.page = 1
 15         self.count = 1
 16         self.pages = []    
 17         self.enable = False    
 18     
 19     # 将所有的段子都扣出来，添加到列表中并且返回列表    
 20     def GetPage(self,page):    
 21         myUrl = "http://m.qiushibaike.com/hot/page/" + page    
 22         user_agent = ‘Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)‘   
 23         headers = { ‘User-Agent‘ : user_agent }   
 24         req = urllib2.Request(myUrl, headers = headers)   
 25         myResponse = urllib2.urlopen(req)  
 26         myPage = myResponse.read()    
 27         #encode的作用是将unicode编码转换成其他编码的字符串    
 28         #decode的作用是将其他编码的字符串转换成unicode编码    
 29         unicodePage = myPage.decode("utf-8")    
 30     
 31         # 找出所有class="content"的div标记    
 32         #re.S是任意匹配模式，也就是.可以匹配换行符    
 33         myItems = re.findall(‘<div class="content">.*?</div>‘,unicodePage,re.S)    
 34         items = []    
 35         for item in myItems:    
 36             #去掉段子中网页相关的内容
 37             strinfo = re.compile(u‘<.*?>‘)
 38             tt = strinfo.sub(u‘‘, item)
 39 
 40             #strinfo1 = re.compile(u‘^\n*‘)
 41             #tt = strinfo1.sub(u‘‘, tt)
 42 
 43             #strinfo2 = re.compile(u‘\n*$‘)
 44             #tt = strinfo2.sub(u‘‘, tt)
 45             tt = tt.replace(u‘\n‘,u‘‘)
 46             
 47             
 48             items.append(tt)    
 49         return items    
 50     
 51     # 用于加载新的段子    
 52     def LoadPage(self):    
 53         # 如果用户未输入quit则一直运行    
 54         while self.enable:    
 55             # 如果pages数组中的内容小于2个    
 56             if len(self.pages) < 2:    
 57                 try:    
 58                     # 获取新的页面中的段子们    
 59                     myPage = self.GetPage(str(self.page))    
 60                     self.page += 1    
 61                     self.pages.append(myPage)    
 62                 except:    
 63                     print ‘无法链接糗事百科！‘    
 64             else:    
 65                 time.sleep(1)    
 66             
 67     def ShowPage(self,nowPage,page):    
 68         for items in nowPage:    
 69             print u‘第%d条\n‘ % self.count , items
 70             self.count += 1
 71             myInput = raw_input()    
 72             if myInput == "q":    
 73                 self.enable = False    
 74                 break    
 75             
 76     def Start(self):    
 77         self.enable = True    
 78         page = self.page    
 79     
 80         print u‘......正在搜索中......\n‘    
 81             
 82         # 新建一个线程在后台加载段子并存储    
 83         thread.start_new_thread(self.LoadPage,())    
 84             
 85         #----------- 加载处理糗事百科 -----------    
 86         while self.enable:    
 87             # 如果self的page数组中存有元素    
 88             if self.pages:    
 89                 nowPage = self.pages[0]    
 90                 del self.pages[0]    
 91                 self.ShowPage(nowPage,page)    
 92                 page += 1    
 93     
 94     
 95 #----------- 程序的入口处 -----------    
 96 print u"""  
 97 ---------------------------------------  
 98    程序：糗百爬虫  
 99    版本：1.0  
100    作者：zz  
101    日期：2016-02-16  
102    语言：Python 2.5  
103    操作：输入‘q‘退出阅读糗事百科  
104    功能：按下回车依次浏览今日的糗百热点  
105 ---------------------------------------  
106 """  
107     
108     
109 print u‘请按下回车浏览今日的糗百内容：‘    
110 raw_input(‘ ‘)    
111 myModel = Spider_Model()    
112 myModel.Start()

View Code

运行效果如下：

技术分享

Python爬虫与mysql

标签：

原文地址：http://www.cnblogs.com/lanyuan114489/p/5194751.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行