码迷,mamicode.com
首页 > 其他好文 > 详细

爬腾讯视频所有类型的电影

时间:2015-06-28 21:21:14      阅读:195      评论:0      收藏:0      [点我收藏+]

标签:

未完待续!

  1 #coding: utf-8
  2 import re
  3 import urllib2
  4 from bs4 import BeautifulSoup
  5 import time
  6 import xlwt
  7 import sys
  8 reload(sys)
  9 sys.setdefaultencoding(utf8)
 10 
 11 NUM = 0         #全局变量。电影数量
 12 m_type = u‘‘    #全局变量。电影类型
 13 m_site = uqq  #全局变量。电影网站
 14 
 15 #根据指定的URL获取网页内容
 16 def getHtml(url):
 17     headers = {
 18             User-Agent:Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36,
 19             Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8}
 20     timeout = 30
 21     req = urllib2.Request(url, None, headers)
 22     response = urllib2.urlopen(req, None, timeout)
 23     return response.read()
 24 
 25 #从电影分类列表页面获取电影分类
 26 def getTags(html):
 27     global m_type
 28     soup = BeautifulSoup(html)
 29     #return soup
 30     tags_all = soup.find_all(ul, {class: clearfix _group, gname: mi_type})
 31     #print len(tags_all), tags_all
 32     #print str(tags_all[0]).replace(‘\n‘, ‘‘)
 33     #<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
 34     reTags = r<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>
 35     pattern = re.compile(reTags, re.DOTALL)
 36 
 37     tags = pattern.findall(str(tags_all[0]))
 38     if tags:
 39         tagsURL = {}
 40         for tag in tags:
 41             #print tag
 42             tagURL = tag[0].decode(utf-8)
 43             m_type = tag[1].decode(utf-8)
 44             tagsURL[m_type] = tagURL
 45 
 46     else:
 47         print "Not Find"
 48     return tagsURL
 49 
 50 #获取每个分类的页数
 51 def getPages(tagUrl):
 52     tag_html = getHtml(tagUrl)
 53     #div class="paginator
 54     soup = BeautifulSoup(tag_html)      #过滤出标记页面的html
 55     #print soup
 56     #<div class="mod_pagenav" id="pager">
 57     div_page = soup.find_all(div, {class : mod_pagenav, id : pager})
 58     #print div_page[0]
 59 
 60     #<a _hot="movie.page2." class="c_txt6" href="http://v.qq.com/list/1_18_-1_-1_1_0_1_20_0_-1_0_-1.html" title="2"><span>2</span></a>
 61     re_pages = r<a _hot=.+?><span>(.+?)</span></a>
 62     p = re.compile(re_pages, re.DOTALL)
 63     pages = p.findall(str(div_page[0]))
 64     #print pages
 65     if len(pages) > 1:
 66         return pages[-2]
 67     else:
 68         return 1
 69 
 70 #获取电影列表
 71 def getMovieList(html):
 72     soup = BeautifulSoup(html)
 73     #<ul class="mod_list_pic_130">
 74     divs = soup.find_all(ul, {class: mod_list_pic_130})
 75     #print divs
 76     for divHtml in divs:
 77         divHtml = str(divHtml).replace(\n, ‘‘)
 78         #print divHtml
 79         getMovie(divHtml)
 80 
 81 
 82 def getMovie(html):
 83     global NUM
 84     global m_type
 85     global m_site
 86 
 87     reMovie = r<li><a _hot=\"movie\.image\.link\.1\.\" class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\">.+?</li>
 88     p = re.compile(reMovie, re.DOTALL)
 89     movies = p.findall(html)
 90     #print movies
 91     if movies:
 92 
 93         for movie in movies:
 94             #print movie
 95             NUM += 1
 96             print "%s : %d" % ("=" * 70, NUM)
 97             values = dict(
 98                 movieTitle=movie[1],
 99                 movieUrl=movie[0],
100                 movieSite=m_site,
101                 movieType=m_type
102             )
103             print values
104             table.write(NUM, 0, NUM)
105             table.write(NUM, 1, values[movieTitle])
106             table.write(NUM, 2, values[movieUrl])
107             table.write(NUM, 3, values[movieSite])
108             table.write(NUM, 4, values[movieType])
109 
110 
111 
112 
113 
114 
if __name__ == "__main__":
    # Entry point: read the category list, then walk every page of every
    # category and store the movies found in demo.xls.
    url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    html = getHtml(url)
    tagUrls = getTags(html)

    workbook = xlwt.Workbook()
    # `table` must stay a module-level name: getMovie writes rows to it.
    table = workbook.add_sheet('movies')
    headings = ('number', 'movieTitle', 'movieUrl', 'movieSite', 'movieType')
    for col, heading in enumerate(headings):
        table.write(0, col, heading)

    for tag_name, tag_url in tagUrls.items():
        # Record the category being crawled so getMovie labels each row with
        # it; otherwise m_type keeps whatever getTags assigned last.
        m_type = tag_name
        tag_url = tag_url.encode('utf-8')
        print("%s %s" % (tag_url, tag_name))

        maxPage = int(getPages(tag_url))
        print(maxPage)

        # Page URLs look like
        # http://v.qq.com/list/1_18_-1_-1_1_0_0_20_0_-1_0_-1.html
        # and differ only in the page index that precedes "_20_0_-1_0_-1.html".
        # NOTE(review): the start URL above ends in "_0_20_0_-1_0.html"
        # (no trailing "_-1"); if category hrefs share that shape this
        # replace() is a no-op -- verify against the live site.
        m_url = tag_url.replace('0_20_0_-1_0_-1.html', '')
        for x in range(0, maxPage):
            movie_url = "%s%d_20_0_-1_0_-1.html" % (m_url, x)
            movie_html = getHtml(movie_url)
            getMovieList(movie_html)
            # Pause between page requests.
            time.sleep(10)

        # Saved after every category so finished categories are on disk
        # before the next one starts.
        workbook.save('demo.xls')

 

爬腾讯视频所有类型的电影

标签:

原文地址:http://www.cnblogs.com/nju2014/p/4606036.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!