标签:
未完待续!
# coding: utf-8
"""Scrape movie listings from v.qq.com by category and save them to demo.xls.

Flow: fetch the category index page, extract every category's listing URL,
walk each category's result pages, and write one spreadsheet row per movie
(number, title, URL, site, type).  Python 2 script (urllib2, reload/sys).
"""
import re
import urllib2
from bs4 import BeautifulSoup
import time
import xlwt
import sys

# Python 2 hack: make implicit str<->unicode conversions use UTF-8 so the
# Chinese category names and movie titles survive formatting/printing below.
reload(sys)
sys.setdefaultencoding('utf8')

NUM = 0          # global: number of movies written so far (also the Excel row index)
m_type = u''     # global: current movie category label (Chinese)
m_site = u'qq'   # global: site tag stored with every movie row


def getHtml(url):
    """Fetch *url* with a desktop-browser User-Agent and return the raw body."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    timeout = 30
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req, None, timeout)
    return response.read()


def getTags(html):
    """Return a dict mapping category name -> category listing URL.

    Side effect: leaves the module-global ``m_type`` set to the last
    category seen (kept for compatibility with the rest of the script).
    """
    global m_type
    soup = BeautifulSoup(html)
    # <ul class="clearfix _group" gname="mi_type"> holds the category anchors, e.g.:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    reTags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    pattern = re.compile(reTags, re.DOTALL)

    # BUGFIX: initialise before branching so the final return cannot raise
    # NameError when no tags are matched.
    tagsURL = {}
    tags = pattern.findall(str(tags_all[0]))
    if tags:
        for tag in tags:
            tagURL = tag[0].decode('utf-8')
            m_type = tag[1].decode('utf-8')
            tagsURL[m_type] = tagURL
    else:
        print("Not Find")
    return tagsURL


def getPages(tagUrl):
    """Return the number of result pages for one category listing URL (int)."""
    tag_html = getHtml(tagUrl)
    soup = BeautifulSoup(tag_html)
    # <div class="mod_pagenav" id="pager"> contains the pagination links, e.g.:
    # <a _hot="movie.page2." class="c_txt6" href="..." title="2"><span>2</span></a>
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    re_pages = r'<a _hot=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # The last <span> is the "next page" arrow; the one before it holds
        # the highest page number.  BUGFIX: normalised to int (the original
        # returned str here but int in the else branch).
        return int(pages[-2])
    else:
        return 1


def getMovieList(html):
    """Extract each <ul class="mod_list_pic_130"> movie group and process it."""
    soup = BeautifulSoup(html)
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for divHtml in divs:
        # Flatten to one line so the DOTALL regex in getMovie matches cleanly.
        divHtml = str(divHtml).replace('\n', '')
        getMovie(divHtml)


def getMovie(html):
    """Parse movie anchors out of *html* and append them to the spreadsheet.

    Writes to the module-global ``table`` (xlwt sheet, created in __main__)
    and increments the global row counter ``NUM``.
    """
    global NUM
    global m_type
    global m_site

    reMovie = r'<li><a _hot=\"movie\.image\.link\.1\.\" class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\">.+?</li>'
    p = re.compile(reMovie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        for movie in movies:
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movieTitle=movie[1],
                movieUrl=movie[0],
                movieSite=m_site,
                movieType=m_type,
            )
            print(values)
            table.write(NUM, 0, NUM)
            table.write(NUM, 1, values['movieTitle'])
            table.write(NUM, 2, values['movieUrl'])
            table.write(NUM, 3, values['movieSite'])
            table.write(NUM, 4, values['movieType'])


if __name__ == "__main__":
    url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    html = getHtml(url)
    tagUrls = getTags(html)

    # BUGFIX: renamed the workbook variable from ``file`` (which shadowed the
    # builtin) to ``workbook``.  ``table`` must stay a module-level name:
    # getMovie() writes rows to it as a global.
    workbook = xlwt.Workbook()
    table = workbook.add_sheet('movies')
    table.write(0, 0, 'number')
    table.write(0, 1, 'movieTitle')
    table.write(0, 2, 'movieUrl')
    table.write(0, 3, 'movieSite')
    table.write(0, 4, 'movieType')

    for tag_name, tag_url in tagUrls.items():
        print("%s %s" % (str(tag_url).encode('utf-8'), tag_name))
        maxPage = int(getPages(str(tag_url).encode('utf-8')))
        print(maxPage)

        for x in range(0, maxPage):
            # Rebuild the listing URL tail with the current page offset, e.g.
            # http://v.qq.com/list/1_18_-1_-1_1_0_<page>_20_0_-1_0_-1.html
            # NOTE(review): assumes every tag URL ends in '0_20_0_-1_0_-1.html'
            # — confirm against live category URLs.
            m_url = str(tag_url).replace('0_20_0_-1_0_-1.html', '')
            movie_url = "%s%d_20_0_-1_0_-1.html" % (m_url, x)
            movie_html = getHtml(movie_url.encode('utf-8'))
            getMovieList(movie_html)
            time.sleep(10)  # be polite to the server between page fetches

    workbook.save('demo.xls')
标签:
原文地址:http://www.cnblogs.com/nju2014/p/4606036.html