标签:
参考了此处,做了修改,代码如下:
# coding: utf-8
# Batch downloader for the music-video listing pages of yinyuetai.com.
# Originally written for Python 2 (urllib/urllib2); the import shim below lets
# the same code run unchanged on Python 3.
import contextlib
import os
import re
import sys
import time

try:  # Python 2 module layout
    import urllib
    import urllib2
    from httplib import HTTPException
    urlretrieve = urllib.urlretrieve
except ImportError:  # Python 3: the same API lives in urllib.request
    import urllib.request as urllib2
    from http.client import HTTPException
    from urllib.request import urlretrieve


# Failures that mean "this transfer did not work" (bad URL, network error,
# broken HTTP response, unwritable file) as opposed to a programming error.
_TRANSFER_ERRORS = (IOError, OSError, ValueError, HTTPException)

# Characters that are not allowed in a Windows file name ("/" is invalid on
# every platform); video titles are free text and may contain any of them.
_UNSAFE_FILENAME_CHARS = re.compile(u'[\\\\/:*?"<>|]')


def _to_text(value):
    """Return *value* as text, decoding UTF-8 byte strings if necessary."""
    if isinstance(value, bytes):
        return value.decode("utf-8", "replace")
    return value


class Yinyuetai(object):
    """Walk a paginated MV listing and download every video it links to.

    Constructing an instance starts the crawl immediately and only returns
    once a listing page with no video entries is reached.
    """

    # One listing entry per line: group 1 is the video id, group 2 the title.
    _LIST_ENTRY_RE = re.compile(
        br"<h3><a\shref=\"http:\/\/v.yinyuetai.com\/video\/([0-9]+)\".*title=\"(.*)\".*"
    )
    # A stream URL inside the get-video-info response, up to its "&br" field.
    _VIDEO_URL_RE = re.compile(
        br"http://\w*?\.yinyuetai\.com/uploads/videos/common/.*?(?=&br)"
    )

    def __init__(self, url, delay=2):
        """Store the crawl settings and start downloading.

        url   -- listing URL; "&page=N" is appended for each page.
        delay -- seconds to pause after each video (default 2, as before).
        """
        self.i = 1  # running number of the video being processed
        self.url = url
        self.delay = delay
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
        self.timeout = 30
        self.__init()

    def __init(self, page=1):
        """Download every video from listing page *page* onwards.

        Stops at the first page that yields no entries. Pages are walked in a
        loop (not by recursion), and the page counter is kept separate from
        the fetched HTML so that it can actually be incremented.
        """
        while True:
            print(u"开始下载:第 %d 页 ..." % page)
            html = self.getPage(self.url + "&page=%d" % page)
            entries = self.__getMvPageList(html)
            if not entries:
                break
            for entry in entries:
                try:
                    video_url, title = self.getMvURL(entry)
                except IndexError:
                    # No stream URL for this video: skip it, keep crawling.
                    print(u"下载失败!\n")
                else:
                    self.downLoad(_to_text(video_url), _to_text(title))
                self.i += 1
                time.sleep(self.delay)
            page += 1
        print(u"\n~~~~~~~~~~~完成!~~~~~~~~~~~~~~")

    # Fetch the source of the given page.
    def getPage(self, url):
        """Return the raw response body of *url* as bytes.

        Returns an empty byte string when the request fails, so callers can
        feed the result straight into the regex helpers.
        """
        try:
            request = urllib2.Request(url, None, self.headers)
            opened = urllib2.urlopen(request, None, self.timeout)
            with contextlib.closing(opened) as response:
                return response.read()
        except _TRANSFER_ERRORS as err:
            sys.stderr.write("request failed: %s (%s)\n" % (url, err))
            return b""

    # Parse a listing page into (video id, video title) pairs.
    def __getMvPageList(self, page):
        """Return a list of ``(video_id, title)`` byte-string tuples."""
        return self._LIST_ENTRY_RE.findall(page)

    def getMvURL(self, mvlist):
        """Resolve a listing entry to ``[stream_url, title]``.

        mvlist -- ``(video_id, title)`` as produced by the listing parser.

        The third URL in the info response is used when at least three are
        present, otherwise the first one.
        Raises IndexError when the response contains no stream URL at all.
        """
        url = "http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=%d" % int(mvlist[0])
        html = self.getPage(url)
        found = self._VIDEO_URL_RE.findall(html)
        if not found:
            raise IndexError("no video URL found for video id %d" % int(mvlist[0]))
        chosen = found[2] if len(found) >= 3 else found[0]
        return [chosen, mvlist[1]]

    # Download one file.
    def downLoad(self, url, name):
        """Save *url* as ``<name>.flv`` inside the directory from makeDirs().

        Characters that are invalid in file names are replaced with "_".
        Transfer errors are reported on stdout and do not propagate.
        """
        name = _UNSAFE_FILENAME_CHARS.sub(u"_", name) + '.flv'
        print(u"下载:[%s] [%d]" % (name, self.i))
        local = os.path.join(self.makeDirs(), name)
        try:
            urlretrieve(url, local, self.schedule)
        except _TRANSFER_ERRORS:
            print(u"下载失败!\n")
        else:
            print(u"下载完成:[%s]\n" % name)

    def makeDirs(self):
        """Return the ``flv`` directory under ``sys.path[0]``, creating it if missing."""
        newPath = os.path.join(sys.path[0], 'flv')
        if not os.path.isdir(newPath):
            os.mkdir(newPath)
        return newPath

    def schedule(self, a, b, c):
        """Progress callback for urlretrieve.

        a -- number of blocks transferred so far
        b -- size of one block
        c -- size of the remote file; urlretrieve passes a non-positive value
             when the size is unknown, in which case nothing is printed.
        """
        if c <= 0:
            return
        per = min(100.0 * a * b / c, 100.0)
        sys.stdout.write(u" 进度:%.1f%%\r" % per)
        sys.stdout.flush()


if __name__ == '__main__':
    url = 'http://mv.yinyuetai.com/all?pageType=page&sort=weekViews&tab=allmv&parenttab=mv'
    Yinyuetai(url)
标签:
原文地址:http://www.cnblogs.com/nju2014/p/4471296.html