标签:desktop gen gif ini actor 标题 windows item 编号
待采集的网站如下:
采集的内容为该站点下的“音乐期刊”(#为对应的期刊序数,为正整数)
http://www.luoo.net/music/#
该爬虫原作者的 GitHub 地址:
https://github.com/imchenkun/ick-spider/blob/master/luoospider.py
具体参见这篇博文:
http://www.cnblogs.com/chenkun/p/5653459.html
网上牛人给出的代码:
1 #-*- coding: utf-8 -*- 2 import os 3 import requests 4 from bs4 import BeautifulSoup 5 import random 6 from faker import Factory 7 import Queue 8 import threading 9 10 fake = Factory.create() 11 12 luoo_site = ‘http://www.luoo.net/music/‘ 13 luoo_site_mp3 = ‘http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio%s/%s.mp3‘ 14 15 proxy_ips = [ 16 ‘183.129.151.130‘ # 这里配置可用的代理IP 17 ] 18 19 headers = { 20 ‘Connection‘: ‘keep-alive‘, 21 ‘User-Agent‘: fake.user_agent() 22 } 23 24 25 def random_proxies(): 26 ip_index = random.randint(0, len(proxy_ips)-1) 27 res = { ‘http‘: proxy_ips[ip_index] } 28 return res 29 30 def fix_characters(s): 31 for c in [‘<‘, ‘>‘, ‘:‘, ‘"‘, ‘/‘, ‘\\‘, ‘|‘, ‘?‘, ‘*‘]: 32 s = s.replace(c, ‘‘) 33 return s 34 35 36 class LuooSpider(threading.Thread): 37 def __init__(self, url, vols, queue=None): 38 threading.Thread.__init__(self) 39 print ‘[luoo spider]‘ 40 print ‘=‘ * 20 41 42 self.url = url 43 self.queue = queue 44 self.vol = ‘1‘ 45 self.vols = vols 46 47 def run(self): 48 for vol in self.vols: 49 self.spider(vol) 50 print ‘\ncrawl end\n\n‘ 51 52 def spider(self, vol): 53 url = luoo_site + vol 54 print ‘crawling: ‘ + url + ‘\n‘ 55 res = requests.get(url, proxies=random_proxies()) 56 57 soup = BeautifulSoup(res.content, ‘html.parser‘) 58 title = soup.find(‘span‘, attrs={‘class‘: ‘vol-title‘}).text 59 cover = soup.find(‘img‘, attrs={‘class‘: ‘vol-cover‘})[‘src‘] 60 desc = soup.find(‘div‘, attrs={‘class‘: ‘vol-desc‘}) 61 track_names = soup.find_all(‘a‘, attrs={‘class‘: ‘trackname‘}) 62 track_count = len(track_names) 63 tracks = [] 64 for track in track_names: 65 _id = str(int(track.text[:2])) if (int(vol) < 12) else track.text[:2] # 12期前的音乐编号1~9是1位(如:1~9),之后的都是2位 1~9会在左边垫0(如:01~09) 66 _name = fix_characters(track.text[4:]) 67 tracks.append({‘id‘: _id, ‘name‘: _name}) 68 69 phases = { 70 ‘phase‘: vol, # 期刊编号 71 ‘title‘: title, # 期刊标题 72 ‘cover‘: cover, # 期刊封面 73 ‘desc‘: desc, # 期刊描述 74 ‘track_count‘: track_count, # 节目数 75 ‘tracks‘: tracks # 
节目清单(节目编号,节目名称) 76 } 77 78 self.queue.put(phases) 79 80 81 class LuooDownloader(threading.Thread): 82 def __init__(self, url, dist, queue=None): 83 threading.Thread.__init__(self) 84 self.url = url 85 self.queue = queue 86 self.dist = dist 87 self.__counter = 0 88 89 def run(self): 90 while True: 91 if self.queue.qsize() <= 0: 92 pass 93 else: 94 phases = self.queue.get() 95 self.download(phases) 96 97 def download(self, phases): 98 for track in phases[‘tracks‘]: 99 file_url = self.url % (phases[‘phase‘], track[‘id‘]) 100 101 local_file_dict = ‘%s/%s‘ % (self.dist, phases[‘phase‘]) 102 if not os.path.exists(local_file_dict): 103 os.makedirs(local_file_dict) 104 105 local_file = ‘%s/%s.%s.mp3‘ % (local_file_dict, track[‘id‘], track[‘name‘]) 106 if not os.path.isfile(local_file): 107 print ‘downloading: ‘ + track[‘name‘] 108 res = requests.get(file_url, proxies=random_proxies(), headers=headers) 109 with open(local_file, ‘wb‘) as f: 110 f.write(res.content) 111 f.close() 112 print ‘done.\n‘ 113 else: 114 print ‘break: ‘ + track[‘name‘] 115 116 117 if __name__ == ‘__main__‘: 118 spider_queue = Queue.Queue() 119 120 luoo = LuooSpider(luoo_site, vols=[‘680‘, ‘721‘, ‘725‘, ‘720‘], queue=spider_queue) 121 luoo.setDaemon(True) 122 luoo.start() 123 124 downloader_count = 5 125 for i in range(downloader_count): 126 luoo_download = LuooDownloader(luoo_site_mp3, ‘D:/luoo‘, queue=spider_queue) 127 luoo_download.setDaemon(True) 128 luoo_download.start()
我给出的代码:
"""Crawl luoo.net music volumes and download their tracks as MP3 files.

One ``LuooSpider`` thread scrapes the volume pages and puts one dict per
volume on a shared queue; several ``LuooDownloader`` threads consume that
queue and save every track below a target directory.
"""
import os
import random
import threading
from queue import Queue

from bs4 import BeautifulSoup
import requests

try:
    import winreg  # only available on Windows
except ImportError:
    winreg = None


luoo_site = 'http://www.luoo.net/music/'
luoo_site_mp3 = 'http://mp3-cdn.luoo.net/low/luoo/radio%s/%s.mp3'
user_agents = (
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
)
headers = {
    'Host': 'mp3-cdn.luoo.net',
    'Accept': 'audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'http://www.luoo.net/music/896',
    'Range': 'bytes=0-',
    'Connection': 'keep-alive',
}

# Translation table that deletes the characters < > : " / \ | ? *
_FORBIDDEN_CHARS = str.maketrans('', '', '<>:"/\\|?*')


def get_desktop():
    """Return the path of the current user's desktop directory.

    On Windows the path is read from the "Shell Folders" registry key.
    Where ``winreg`` is unavailable, ``~/Desktop`` is returned instead.
    """
    if winreg is None:
        return os.path.join(os.path.expanduser('~'), 'Desktop')
    # The with statement closes the registry handle again.
    with winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders') as key:
        return winreg.QueryValueEx(key, "Desktop")[0]


def fix_characters(s):
    """Return *s* with the characters ``< > : " / \\ | ? *`` removed."""
    return s.translate(_FORBIDDEN_CHARS)


def fix_order(order):
    """Return the track position *order* as a string of at least two digits."""
    return str(order).zfill(2)


class LuooSpider(threading.Thread):
    """Thread that scrapes volume pages and queues one dict per volume.

    url   -- base URL of the volume pages; the volume number is appended.
    vols  -- iterable of volume numbers.
    queue -- queue that receives the scraped volume dicts.
    """

    def __init__(self, url, vols, queue=None):
        threading.Thread.__init__(self)
        print('[luoo spider]')
        print('=' * 20)
        self.queue = queue
        self.url = url
        self.vols = vols

    def run(self):
        """Scrape every requested volume in order."""
        for vol in self.vols:
            self.spider(vol)
        print('crawl end')

    def spider(self, vol):
        """Scrape the page of volume *vol* and put its description on the queue.

        A page without the expected volume elements is skipped.
        """
        # Use the URL handed to the constructor instead of the module global.
        url = self.url + str(vol)
        print('?crawling: ' + url)
        res = requests.get(url)
        soup = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
        try:
            # find() returns None for a missing element, so .text raises
            # AttributeError and ['src'] raises TypeError.
            title = soup.find('span', attrs={'class': 'vol-title'}).text
            cover = soup.find('img', attrs={'class': 'vol-cover'})['src']
            desc = soup.find('div', attrs={'class': 'vol-desc'}).text
            author = soup.find('a', attrs={'class': 'vol-author'}).text
            date = soup.find('span', attrs={'class': 'vol-date'}).text
        except (AttributeError, TypeError):
            print('Looks like nothing to do here?!')
            return
        track_infos = soup.find_all('li', attrs={'class': 'track-item rounded'})
        tracks = []
        for order, track in enumerate(track_infos, 1):
            tracks.append({
                '_order': fix_order(order),
                # NOTE(review): the slices below presumably strip fixed
                # prefixes from the id and the artist/album labels - confirm
                # against the page markup.
                '_id': track['id'][5:],
                '_cover': track.find('img', attrs={'class': 'cover rounded'})['src'],
                '_name': track.find('p', attrs={'class': 'name'}).text,
                '_artist': track.find('p', attrs={'class': 'artist'}).text[8:],
                '_album': track.find('p', attrs={'class': 'album'}).text[7:],
            })
        vols = {
            'vol_num': vol,
            'vol_title': title,
            'vol_cover': cover,
            'vol_desc': desc,
            'vol_author': author,
            'vol_date': date,
            'track_count': len(track_infos),
            'tracks': tracks
        }
        self.queue.put(vols)


class LuooDownloader(threading.Thread):
    """Thread that takes volume dicts from the queue and downloads their tracks.

    url   -- MP3 URL template with two ``%s`` slots (volume number, track order).
    dist  -- directory below which one sub-directory per volume is created.
    queue -- queue of volume dicts; a ``None`` item tells the thread to stop.
    """

    def __init__(self, url, dist, queue=None):
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0
        # Each downloader gets its own header copy with its own User-Agent;
        # mutating the shared module-level dict would affect every thread.
        self.headers = dict(headers)
        self.headers['User-Agent'] = random.choice(user_agents)

    def run(self):
        """Download queued volumes until a ``None`` sentinel is received."""
        while True:
            # A blocking get() replaces the former qsize() polling loop,
            # which kept a CPU core busy and never terminated.
            phases = self.queue.get()
            if phases is None:
                break
            self.download(phases)

    def download(self, phases):
        """Download every track of one volume unless the file already exists."""
        vol_num = phases['vol_num']
        local_file_dict = '%s/%s' % (self.dist, vol_num)
        for track in phases['tracks']:
            order = track['_order']
            name = track['_name']
            os.makedirs(local_file_dict, exist_ok=True)
            # Track names come straight from the page; strip the characters
            # that are not allowed in Windows file names.
            local_file = '%s/%s.%s.mp3' % (local_file_dict, order, fix_characters(name))
            print('?processing: ' + name)
            if os.path.isfile(local_file):
                print('?skipped: ' + name)
                continue
            print('?downloading: ' + name)
            res = requests.get(self.url % (vol_num, order), headers=self.headers).content
            if len(res) < 280:
                # NOTE(review): a body this short is presumably an error page
                # rather than audio - confirm. Retry with the track number
                # without its leading zero.
                file_url = self.url % (vol_num, str(int(order)))
                res = requests.get(file_url, headers=self.headers).content
            with open(local_file, 'wb') as f:
                f.write(res)
            print('?completed: ' + name)


if __name__ == '__main__':
    vol_queue = Queue()
    luoo = LuooSpider(luoo_site, vols=range(1, 1000), queue=vol_queue)
    luoo.start()

    downloader_count = 10
    target_dir = get_desktop() + '/luoo'
    downloaders = []
    for _ in range(downloader_count):
        luoo_download = LuooDownloader(luoo_site_mp3, target_dir, queue=vol_queue)
        luoo_download.start()
        downloaders.append(luoo_download)

    # Once the spider has finished, tell every downloader to stop after the
    # queue has been drained, so that the program can terminate.
    luoo.join()
    for _ in downloaders:
        vol_queue.put(None)
    for luoo_download in downloaders:
        luoo_download.join()
标签:desktop gen gif ini actor 标题 windows item 编号
原文地址:http://www.cnblogs.com/mixisila/p/6549567.html