标签:
未完待续!
# coding: utf-8
"""Scrape movie listings from v.qq.com by category and save them to demo.xls.

Flow: fetch the category index page, extract every category's listing URL,
walk each category's result pages, and write one spreadsheet row per movie
(number, title, URL, site, type).  Python 2 script (urllib2, reload/sys).
"""
import re
import urllib2
from bs4 import BeautifulSoup
import time
import xlwt
import sys

# Python 2 hack: make implicit str<->unicode conversions use UTF-8 so the
# Chinese category names and movie titles survive formatting/printing below.
reload(sys)
sys.setdefaultencoding('utf8')

NUM = 0          # global: number of movies written so far (also the Excel row index)
m_type = u''     # global: current movie category label (Chinese)
m_site = u'qq'   # global: site tag stored with every movie row


def getHtml(url):
    """Fetch *url* with a desktop-browser User-Agent and return the raw body."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    timeout = 30
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req, None, timeout)
    return response.read()


def getTags(html):
    """Return a dict mapping category name -> category listing URL.

    Side effect: leaves the module-global ``m_type`` set to the last
    category seen (kept for compatibility with the rest of the script).
    """
    global m_type
    soup = BeautifulSoup(html)
    # <ul class="clearfix _group" gname="mi_type"> holds the category anchors, e.g.:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    reTags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    pattern = re.compile(reTags, re.DOTALL)

    # BUGFIX: initialise before branching so the final return cannot raise
    # NameError when no tags are matched.
    tagsURL = {}
    tags = pattern.findall(str(tags_all[0]))
    if tags:
        for tag in tags:
            tagURL = tag[0].decode('utf-8')
            m_type = tag[1].decode('utf-8')
            tagsURL[m_type] = tagURL
    else:
        print("Not Find")
    return tagsURL


def getPages(tagUrl):
    """Return the number of result pages for one category listing URL (int)."""
    tag_html = getHtml(tagUrl)
    soup = BeautifulSoup(tag_html)
    # <div class="mod_pagenav" id="pager"> contains the pagination links, e.g.:
    # <a _hot="movie.page2." class="c_txt6" href="..." title="2"><span>2</span></a>
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    re_pages = r'<a _hot=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # The last <span> is the "next page" arrow; the one before it holds
        # the highest page number.  BUGFIX: normalised to int (the original
        # returned str here but int in the else branch).
        return int(pages[-2])
    else:
        return 1


def getMovieList(html):
    """Extract each <ul class="mod_list_pic_130"> movie group and process it."""
    soup = BeautifulSoup(html)
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for divHtml in divs:
        # Flatten to one line so the DOTALL regex in getMovie matches cleanly.
        divHtml = str(divHtml).replace('\n', '')
        getMovie(divHtml)


def getMovie(html):
    """Parse movie anchors out of *html* and append them to the spreadsheet.

    Writes to the module-global ``table`` (xlwt sheet, created in __main__)
    and increments the global row counter ``NUM``.
    """
    global NUM
    global m_type
    global m_site

    reMovie = r'<li><a _hot=\"movie\.image\.link\.1\.\" class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\">.+?</li>'
    p = re.compile(reMovie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        for movie in movies:
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movieTitle=movie[1],
                movieUrl=movie[0],
                movieSite=m_site,
                movieType=m_type,
            )
            print(values)
            table.write(NUM, 0, NUM)
            table.write(NUM, 1, values['movieTitle'])
            table.write(NUM, 2, values['movieUrl'])
            table.write(NUM, 3, values['movieSite'])
            table.write(NUM, 4, values['movieType'])


if __name__ == "__main__":
    url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    html = getHtml(url)
    tagUrls = getTags(html)

    # BUGFIX: renamed the workbook variable from ``file`` (which shadowed the
    # builtin) to ``workbook``.  ``table`` must stay a module-level name:
    # getMovie() writes rows to it as a global.
    workbook = xlwt.Workbook()
    table = workbook.add_sheet('movies')
    table.write(0, 0, 'number')
    table.write(0, 1, 'movieTitle')
    table.write(0, 2, 'movieUrl')
    table.write(0, 3, 'movieSite')
    table.write(0, 4, 'movieType')

    for tag_name, tag_url in tagUrls.items():
        print("%s %s" % (str(tag_url).encode('utf-8'), tag_name))
        maxPage = int(getPages(str(tag_url).encode('utf-8')))
        print(maxPage)

        for x in range(0, maxPage):
            # Rebuild the listing URL tail with the current page offset, e.g.
            # http://v.qq.com/list/1_18_-1_-1_1_0_<page>_20_0_-1_0_-1.html
            # NOTE(review): assumes every tag URL ends in '0_20_0_-1_0_-1.html'
            # — confirm against live category URLs.
            m_url = str(tag_url).replace('0_20_0_-1_0_-1.html', '')
            movie_url = "%s%d_20_0_-1_0_-1.html" % (m_url, x)
            movie_html = getHtml(movie_url.encode('utf-8'))
            getMovieList(movie_html)
            time.sleep(10)  # be polite to the server between page fetches

    workbook.save('demo.xls')
标签:
原文地址:http://www.cnblogs.com/nju2014/p/4606036.html