码迷,mamicode.com
首页 > 编程语言 > 详细

Python多线程采集数据示例:落网_音乐期刊

时间:2017-03-14 17:46:41      阅读:456      评论:0      收藏:0      [点我收藏+]

标签:desktop   gen   gif   ini   actor   标题   windows   item   编号   

 待采集的网站如下:

http://www.luoo.net/

采集的内容为该站点下的“音乐期刊”(#为对应的期刊序数,为正整数)

http://www.luoo.net/music/#

原创爬虫的作者的github地址:

https://github.com/imchenkun/ick-spider/blob/master/luoospider.py

具体参见这篇博文:

http://www.cnblogs.com/chenkun/p/5653459.html

 

网上牛人给出的代码:

技术分享
  1 #-*- coding: utf-8 -*-
  2 import os
  3 import requests
  4 from bs4 import BeautifulSoup
  5 import random
  6 from faker import Factory
  7 import Queue
  8 import threading
  9 
 10 fake = Factory.create()
 11 
 12 luoo_site = http://www.luoo.net/music/
 13 luoo_site_mp3 = http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio%s/%s.mp3
 14 
 15 proxy_ips = [
 16     183.129.151.130 # 这里配置可用的代理IP
 17     ]
 18 
 19 headers = {
 20     Connection: keep-alive,
 21     User-Agent: fake.user_agent()
 22     }
 23 
 24 
 25 def random_proxies():
 26     ip_index = random.randint(0, len(proxy_ips)-1)
 27     res = { http: proxy_ips[ip_index] }
 28     return res
 29 
 30 def fix_characters(s):
 31     for c in [<, >, :, ", /, \\, |, ?, *]:
 32         s = s.replace(c, ‘‘)
 33     return s
 34 
 35 
 36 class LuooSpider(threading.Thread):
 37     def __init__(self, url, vols, queue=None):
 38         threading.Thread.__init__(self)
 39         print [luoo spider]
 40         print = * 20
 41 
 42         self.url = url
 43         self.queue = queue
 44         self.vol = 1
 45         self.vols = vols
 46 
 47     def run(self):
 48         for vol in self.vols:
 49             self.spider(vol)
 50         print \ncrawl end\n\n
 51     
 52     def spider(self, vol):
 53         url = luoo_site + vol
 54         print crawling:  + url + \n
 55         res = requests.get(url, proxies=random_proxies())
 56         
 57         soup = BeautifulSoup(res.content, html.parser)
 58         title = soup.find(span, attrs={class: vol-title}).text
 59         cover = soup.find(img, attrs={class: vol-cover})[src]
 60         desc = soup.find(div, attrs={class: vol-desc})
 61         track_names = soup.find_all(a, attrs={class: trackname})
 62         track_count = len(track_names)
 63         tracks = []
 64         for track in track_names:
 65             _id = str(int(track.text[:2])) if (int(vol) < 12) else track.text[:2]  # 12期前的音乐编号1~9是1位(如:1~9),之后的都是2位 1~9会在左边垫0(如:01~09)
 66             _name = fix_characters(track.text[4:])
 67             tracks.append({id: _id, name: _name})
 68 
 69         phases = {
 70             phase: vol,                        # 期刊编号
 71             title: title,                      # 期刊标题
 72             cover: cover,                      # 期刊封面
 73             desc: desc,                        # 期刊描述
 74             track_count: track_count,          # 节目数
 75             tracks: tracks                     # 节目清单(节目编号,节目名称)
 76             }
 77         
 78         self.queue.put(phases)
 79 
 80 
 81 class LuooDownloader(threading.Thread):
 82     def __init__(self, url, dist, queue=None):
 83         threading.Thread.__init__(self)
 84         self.url = url
 85         self.queue = queue
 86         self.dist = dist
 87         self.__counter = 0
 88         
 89     def run(self):
 90         while True:
 91             if self.queue.qsize() <= 0:
 92                 pass
 93             else:
 94                 phases = self.queue.get()
 95                 self.download(phases)
 96 
 97     def download(self, phases):
 98         for track in phases[tracks]:
 99             file_url = self.url % (phases[phase], track[id])
100 
101             local_file_dict = %s/%s % (self.dist, phases[phase])
102             if not os.path.exists(local_file_dict):
103                 os.makedirs(local_file_dict)
104             
105             local_file = %s/%s.%s.mp3 % (local_file_dict, track[id], track[name])
106             if not os.path.isfile(local_file):
107                 print downloading:  + track[name]
108                 res = requests.get(file_url, proxies=random_proxies(), headers=headers)
109                 with open(local_file, wb) as f:
110                     f.write(res.content)
111                     f.close()
112                 print done.\n
113             else:
114                 print break:  + track[name]
115 
116 
if __name__ == '__main__':
    import time

    spider_queue = Queue.Queue()

    # Journal numbers are strings because the spider appends them to the URL.
    luoo = LuooSpider(luoo_site, vols=['680', '721', '725', '720'], queue=spider_queue)
    luoo.daemon = True
    luoo.start()

    downloader_count = 5
    for i in range(downloader_count):
        luoo_download = LuooDownloader(luoo_site_mp3, 'D:/luoo', queue=spider_queue)
        luoo_download.daemon = True
        luoo_download.start()

    # Every worker is a daemon thread and dies with the main thread, so the
    # main thread has to stay alive for any download to finish.  The
    # downloaders never return on their own; stop the script with Ctrl+C.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
网上版本

我给出的代码:

技术分享
  1 import os
  2 import random
  3 import threading
  4 import winreg
  5 from queue import Queue
  6 from bs4 import BeautifulSoup
  7 import requests
  8 
  9 
 10 luoo_site = http://www.luoo.net/music/
 11 luoo_site_mp3 = http://mp3-cdn.luoo.net/low/luoo/radio%s/%s.mp3
 12 user_agents = (
 13     "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
 14     "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
 15     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
 16     "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
 17     "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
 18     "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
 19     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
 20     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
 21     )
 22 headers = {
 23     Host: mp3-cdn.luoo.net,
 24     Accept: audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5,
 25     Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3,
 26     Referer: http://www.luoo.net/music/896,
 27     Range: bytes=0-,
 28     Connection: keep-alive,
 29     }
 30 
 31 
 32 def get_desktop():
 33     key = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 34         rSoftware\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders,)
 35     return winreg.QueryValueEx(key, "Desktop")[0]
 36 
 37 def fix_characters(s):
 38     for c in (<, >, :, ", /, \\, |, ?, *):
 39         s = s.replace(c, ‘‘)
 40     return s
 41 
 42 def fix_order(order):
 43     fix_order = str(order)
 44     if order < 10:
 45         fix_order = "0" + fix_order
 46     return fix_order
 47 
 48 
 49 class LuooSpider(threading.Thread):
 50     def __init__(self, url, vols, queue=None):
 51         threading.Thread.__init__(self)
 52         print( [luoo spider])
 53         print( = * 20)
 54         self.queue = queue
 55         self.url = url
 56         self.vols = vols
 57 
 58     def run(self):
 59         for vol in self.vols:
 60             self.spider(vol)
 61         print( crawl end)
 62 
 63     def spider(self, vol):
 64         url = luoo_site + str(vol)
 65         print( ?crawling:  + url)
 66         res = requests.get(url)
 67         soup = BeautifulSoup(res.content.decode(utf-8), html.parser)
 68         try:
 69             title = soup.find(span, attrs={class: vol-title}).text
 70         except:
 71             print(Looks like nothing to do here?!)
 72             return
 73         cover = soup.find(img, attrs={class: vol-cover})[src]
 74         desc = soup.find(div, attrs={class: vol-desc}).text
 75         author = soup.find(a, attrs={class: vol-author}).text
 76         date = soup.find(span, attrs={class: vol-date}).text
 77         track_infos = soup.find_all(li, attrs={class: track-item rounded})
 78         track_count = len(track_infos)
 79         order = 1
 80         tracks = []
 81         for track in track_infos:
 82             a_track = {}
 83             a_track[_order] = fix_order(order)
 84             a_track[_id] = track[id][5:]
 85             a_track[_cover] = track.find(img, attrs={class: cover rounded})[src]
 86             a_track[_name] = track.find(p, attrs={class: name}).text
 87             a_track[_artist] = track.find(p, attrs={class: artist}).text[8:]
 88             a_track[_album] = track.find(p, attrs={class: album}).text[7:]
 89             tracks.append(a_track)
 90             order += 1
 91         vols = {
 92             vol_num: vol,
 93             vol_title: title,
 94             vol_cover: cover,
 95             vol_desc: desc,
 96             vol_author: author,
 97             vol_date: date,
 98             track_count: track_count,
 99             tracks: tracks
100             }
101         self.queue.put(vols)
102 
103 
class LuooDownloader(threading.Thread):
    """Consumer thread: downloads the MP3 files of queued journals."""

    def __init__(self, url, dist, queue=None):
        """Store the download parameters.

        Args:
            url: MP3 URL template taking (journal number, track number).
            dist: Root directory; one sub-directory is created per journal.
            queue: Queue the journal dicts are read from.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0

    def run(self):
        """Download queued journals forever."""
        while True:
            # A blocking get() replaces the original qsize() polling loop,
            # which kept a CPU core busy whenever the queue was empty.
            phases = self.queue.get()
            self.download(phases)

    def download(self, phases):
        """Download every track of one journal that is not on disk yet.

        Args:
            phases: Journal dict with the keys ``vol_num`` and ``tracks``;
                each track is a dict with ``_order`` and ``_name``.
        """
        # The target directory is the same for every track, so create it once.
        local_file_dict = '%s/%s' % (self.dist, phases['vol_num'])
        os.makedirs(local_file_dict, exist_ok=True)

        for track in phases['tracks']:
            file_url = self.url % (phases['vol_num'], track['_order'])
            local_file = '%s/%s.%s.mp3' % (local_file_dict, track['_order'], track['_name'])
            print(' ?processing: ' + track['_name'])
            if os.path.isfile(local_file):
                print(' ?skipped: ' + track['_name'])
                continue

            print(' ?downloading: ' + track['_name'])
            res = requests.get(file_url, headers=headers).content
            if len(res) < 280:
                # Presumably a body this small is an error page rather than
                # audio (TODO confirm the 280-byte threshold); retry with the
                # track number without its leading zero.
                file_url = self.url % (phases['vol_num'], str(int(track['_order'])))
                res = requests.get(file_url, headers=headers).content
            with open(local_file, 'wb') as f:
                f.write(res)
            print('?completed: ' + track['_name'])
137 
138 
if __name__ == '__main__':
    vol_queue = Queue()
    luoo = LuooSpider(luoo_site, vols=range(1, 1000), queue=vol_queue)
    luoo.start()

    # ``headers`` is a single dict shared by every downloader, so choosing a
    # User-Agent once per thread only ever left the last choice in effect.
    headers['User-Agent'] = random.choice(user_agents)
    # Read the desktop path from the registry once, not once per thread.
    download_dir = get_desktop() + '/luoo'

    downloader_count = 10
    for _ in range(downloader_count):
        luoo_download = LuooDownloader(luoo_site_mp3, download_dir, queue=vol_queue)
        luoo_download.start()
改进版本

 

Python多线程采集数据示例:落网_音乐期刊

标签:desktop   gen   gif   ini   actor   标题   windows   item   编号   

原文地址:http://www.cnblogs.com/mixisila/p/6549567.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!