网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。via 百度百科网络爬虫
网络蜘蛛(Web spider)也叫网络爬虫(Web crawler)[1],蚂蚁(ant),自动检索工具(automatic indexer),或者(在FOAF软件概念中)网络疾走(WEB scutter),是一种“自动化浏览网络”的程序,或者说是一种网络机器人。它们被广泛用于互联网搜索引擎或其他类似网站,以获取或更新这些网站的内容和检索方式。它们可以自动采集所有其能够访问到的页面内容,以供搜索引擎做进一步处理(分检整理下载的页面),而使得用户能更快的检索到他们需要的信息。via 维基百科网络蜘蛛
这里我们通过分析一个网站(落网),对网站内容进行提取,来进一步了解网络爬虫!
第一步 确定目的
第二步 分析页面结构
Requests 用来发起请求
BeautifulSoup(bs4) 用来解析HTML结构并提取内容
# -*- coding: utf-8 -*-
"""Crawl luoo volume pages and download the tracks they list.

by sudo rm -rf
"""
import os
import random
import threading

try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # Python 3 renamed the module

import requests
from bs4 import BeautifulSoup
from faker import Factory

fake = Factory.create()

# NOTE: both URL values are empty in this listing and must be filled in.
# luoo_site is used as a prefix to which the volume number is appended;
# luoo_site_mp3 is used as a format string that receives two values
# (volume number, track id), so it needs two %s placeholders.
luoo_site = ''
luoo_site_mp3 = ''

proxy_ips = ['']  # replace with your own proxy IPs

headers = {
    'Connection': 'keep-alive',
    'User-Agent': fake.user_agent(),
}


def random_proxies():
    """Return a ``proxies`` mapping for requests using a random proxy.

    Returns:
        A dict with the single key ``'http'`` mapped to one entry of the
        module-level ``proxy_ips`` list.

    Raises:
        IndexError: If ``proxy_ips`` is empty.
    """
    return {'http': random.choice(proxy_ips)}


def fix_characters(s):
    """Return *s* with every ``< > : " / \\ | ? *`` character removed.

    The result is used as part of a local file name; these are the
    characters Windows does not allow in file names.
    """
    for c in '<>:"/\\|?*':
        s = s.replace(c, '')
    return s


class LuooSpider(threading.Thread):
    """Thread that scrapes volume pages and queues one dict per volume.

    Args:
        url: Prefix of the volume page URL; the volume number is appended.
        vols: Iterable of volume numbers given as strings.
        queue: Queue that receives the dict built for each volume.
    """

    def __init__(self, url, vols, queue=None):
        threading.Thread.__init__(self)
        print('[luoo spider]')
        print('=' * 20)
        self.url = url
        self.queue = queue
        self.vol = '1'  # not read anywhere in this class
        self.vols = vols

    def run(self):
        """Scrape every requested volume in order, then report completion."""
        for vol in self.vols:
            self.spider(vol)
        print('\ncrawl end\n\n')

    def spider(self, vol):
        """Scrape one volume page and put its metadata dict on the queue.

        Args:
            vol: Volume number as a string; it is appended to ``self.url``.
        """
        url = self.url + vol
        print('crawling: ' + url + '\n')
        res = requests.get(url, proxies=random_proxies(), headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')

        title_tag = soup.find('span', attrs={'class': 'vol-title'})
        cover_tag = soup.find('img', attrs={'class': 'vol-cover'})
        if title_tag is None or cover_tag is None:
            # find() returns None when nothing matches; skip this volume
            # instead of letting AttributeError/TypeError end the thread
            # before the remaining volumes are crawled.
            print('skip: ' + url + '\n')
            return

        # Stored as the bs4 element itself (or None), not as its text.
        desc = soup.find('div', attrs={'class': 'vol-desc'})
        track_names = soup.find_all('a', attrs={'class': 'trackname'})

        # Before volume 12 the track numbers 1-9 are a single digit (1..9);
        # from then on they are two digits, left-padded with 0 (01..09).
        zero_padded = int(vol) >= 12
        tracks = []
        for track in track_names:
            # presumably the link text looks like "01. Name" - TODO confirm
            number = track.text[:2]
            _id = number if zero_padded else str(int(number))
            _name = fix_characters(track.text[4:])
            tracks.append({'id': _id, 'name': _name})

        phases = {
            'phase': vol,                     # volume number
            'title': title_tag.text,          # volume title
            'cover': cover_tag['src'],        # volume cover image URL
            'desc': desc,                     # volume description element
            'track_count': len(track_names),  # number of tracks
            'tracks': tracks,                 # track list (id, name)
        }
        self.queue.put(phases)


class LuooDownloader(threading.Thread):
    """Worker thread that downloads the tracks of queued volumes.

    Args:
        url: Format string for a track URL; it receives the volume number
            and the track id.
        dist: Root directory under which one folder per volume is created.
        queue: Queue of volume dicts as produced by ``LuooSpider``.
    """

    def __init__(self, url, dist, queue=None):
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0  # not read anywhere in this class

    def run(self):
        """Process queued volumes forever, one at a time."""
        while True:
            # get() blocks until an item is available, so no polling of
            # qsize() is needed.
            phases = self.queue.get()
            try:
                self.download(phases)
            except Exception as exc:
                # Thread boundary: report and keep the worker alive, so
                # the remaining queued volumes are still processed.
                print('download failed: %s' % exc)
            finally:
                # Lets queue.join() in the main thread know this item is
                # finished, whether or not it succeeded.
                self.queue.task_done()

    @staticmethod
    def _ensure_dir(path):
        """Create *path* if needed, tolerating a concurrent creation."""
        try:
            os.makedirs(path)
        except OSError:
            # Another worker may have created it first (or it existed).
            if not os.path.isdir(path):
                raise

    def download(self, phases):
        """Download every track of one volume that is not on disk yet.

        Args:
            phases: Volume dict; ``'phase'`` and ``'tracks'`` are read, and
                each track supplies ``'id'`` and ``'name'``.
        """
        local_file_dict = '%s/%s' % (self.dist, phases['phase'])
        self._ensure_dir(local_file_dict)

        for track in phases['tracks']:
            file_url = self.url % (phases['phase'], track['id'])
            local_file = '%s/%s.%s.mp3' % (
                local_file_dict, track['id'], track['name'])

            if os.path.isfile(local_file):
                print('break: ' + track['name'])
                continue

            print('downloading: ' + track['name'])
            res = requests.get(
                file_url, proxies=random_proxies(), headers=headers)
            if not res.ok:
                # Do not store an error page as an .mp3: the file would
                # also make later runs skip this track.
                print('failed: ' + track['name'])
                continue
            with open(local_file, 'wb') as f:
                f.write(res.content)
            print('done.\n')


if __name__ == '__main__':
    spider_queue = Queue.Queue()

    luoo = LuooSpider(
        luoo_site, vols=['680', '721', '725', '720'], queue=spider_queue)
    luoo.daemon = True
    luoo.start()

    downloader_count = 5
    for i in range(downloader_count):
        luoo_download = LuooDownloader(
            luoo_site_mp3, 'D:/luoo', queue=spider_queue)
        luoo_download.daemon = True
        luoo_download.start()

    # Daemon threads are killed as soon as the main thread ends, so wait
    # until the spider has queued every volume and the workers have
    # marked each queued volume as done.
    luoo.join()
    spider_queue.join()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# -*- coding: utf-8 -*-
"""Crawl luoo volume pages and download the tracks they list.

by sudo rm -rf
"""
import os
import random
import threading

try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # Python 3 renamed the module

import requests
from bs4 import BeautifulSoup
from faker import Factory

fake = Factory.create()

# NOTE: both URL values are empty in this listing and must be filled in.
# luoo_site is used as a prefix to which the volume number is appended;
# luoo_site_mp3 is used as a format string that receives two values
# (volume number, track id), so it needs two %s placeholders.
luoo_site = ''
luoo_site_mp3 = ''

proxy_ips = ['']  # replace with your own proxy IPs

headers = {
    'Connection': 'keep-alive',
    'User-Agent': fake.user_agent(),
}
def random_proxies():
    """Return a ``proxies`` mapping for requests using a random proxy.

    Returns:
        A dict with the single key ``'http'`` mapped to one entry of the
        module-level ``proxy_ips`` list.

    Raises:
        IndexError: If ``proxy_ips`` is empty.
    """
    return {'http': random.choice(proxy_ips)}
def fix_characters(s):
    """Return *s* with every ``< > : " / \\ | ? *`` character removed.

    The result is used as part of a local file name; these are the
    characters Windows does not allow in file names.

    Args:
        s: Text to clean, e.g. a track name.

    Returns:
        The text without any of the listed characters.
    """
    # '\\' is a single backslash, so lone backslashes are removed too.
    for c in '<>:"/\\|?*':
        s = s.replace(c, '')
    return s
class LuooSpider(threading.Thread):
    """Thread that scrapes volume pages and queues one dict per volume.

    Args:
        url: Prefix of the volume page URL; the volume number is appended.
        vols: Iterable of volume numbers given as strings.
        queue: Queue that receives the dict built for each volume.
    """

    def __init__(self, url, vols, queue=None):
        threading.Thread.__init__(self)
        print('[luoo spider]')
        print('=' * 20)
        self.url = url
        self.queue = queue
        self.vol = '1'  # not read anywhere in this class
        self.vols = vols

    def run(self):
        """Scrape every requested volume in order, then report completion."""
        for vol in self.vols:
            self.spider(vol)
        print('\ncrawl end\n\n')

    def spider(self, vol):
        """Scrape one volume page and put its metadata dict on the queue.

        Args:
            vol: Volume number as a string; it is appended to ``self.url``.
        """
        # Use the URL handed to the constructor rather than the global.
        url = self.url + vol
        print('crawling: ' + url + '\n')
        # Send the same shared headers the downloader sends.
        res = requests.get(url, proxies=random_proxies(), headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')

        title_tag = soup.find('span', attrs={'class': 'vol-title'})
        cover_tag = soup.find('img', attrs={'class': 'vol-cover'})
        if title_tag is None or cover_tag is None:
            # find() returns None when nothing matches; skip this volume
            # instead of letting AttributeError/TypeError end the thread
            # before the remaining volumes are crawled.
            print('skip: ' + url + '\n')
            return

        # Stored as the bs4 element itself (or None), not as its text.
        desc = soup.find('div', attrs={'class': 'vol-desc'})
        track_names = soup.find_all('a', attrs={'class': 'trackname'})

        # Before volume 12 the track numbers 1-9 are a single digit (1..9);
        # from then on they are two digits, left-padded with 0 (01..09).
        zero_padded = int(vol) >= 12
        tracks = []
        for track in track_names:
            # presumably the link text looks like "01. Name" - TODO confirm
            number = track.text[:2]
            _id = number if zero_padded else str(int(number))
            _name = fix_characters(track.text[4:])
            tracks.append({'id': _id, 'name': _name})

        phases = {
            'phase': vol,                     # volume number
            'title': title_tag.text,          # volume title
            'cover': cover_tag['src'],        # volume cover image URL
            'desc': desc,                     # volume description element
            'track_count': len(track_names),  # number of tracks
            'tracks': tracks,                 # track list (id, name)
        }
        self.queue.put(phases)
class LuooDownloader(threading.Thread):
    """Worker thread that downloads the tracks of queued volumes.

    Args:
        url: Format string for a track URL; it receives the volume number
            and the track id.
        dist: Root directory under which one folder per volume is created.
        queue: Queue of volume dicts as produced by ``LuooSpider``.
    """

    def __init__(self, url, dist, queue=None):
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0  # not read anywhere in this class

    def run(self):
        """Process queued volumes forever, one at a time."""
        while True:
            # get() blocks until an item is available, so no polling of
            # qsize() is needed.
            phases = self.queue.get()
            try:
                self.download(phases)
            except Exception as exc:
                # Thread boundary: report and keep the worker alive, so
                # the remaining queued volumes are still processed.
                print('download failed: %s' % exc)
            finally:
                # Lets queue.join() in the main thread know this item is
                # finished, whether or not it succeeded.
                self.queue.task_done()

    @staticmethod
    def _ensure_dir(path):
        """Create *path* if needed, tolerating a concurrent creation."""
        try:
            os.makedirs(path)
        except OSError:
            # Another worker may have created it first (or it existed).
            if not os.path.isdir(path):
                raise

    def download(self, phases):
        """Download every track of one volume that is not on disk yet.

        Args:
            phases: Volume dict; ``'phase'`` and ``'tracks'`` are read, and
                each track supplies ``'id'`` and ``'name'``.
        """
        local_file_dict = '%s/%s' % (self.dist, phases['phase'])
        self._ensure_dir(local_file_dict)

        for track in phases['tracks']:
            file_url = self.url % (phases['phase'], track['id'])
            local_file = '%s/%s.%s.mp3' % (
                local_file_dict, track['id'], track['name'])

            if os.path.isfile(local_file):
                print('break: ' + track['name'])
                continue

            print('downloading: ' + track['name'])
            res = requests.get(
                file_url, proxies=random_proxies(), headers=headers)
            if not res.ok:
                # Do not store an error page as an .mp3: the file would
                # also make later runs skip this track.
                print('failed: ' + track['name'])
                continue
            with open(local_file, 'wb') as f:
                f.write(res.content)
            print('done.\n')


if __name__ == '__main__':
    spider_queue = Queue.Queue()

    luoo = LuooSpider(
        luoo_site, vols=['680', '721', '725', '720'], queue=spider_queue)
    luoo.daemon = True
    luoo.start()

    downloader_count = 5
    for i in range(downloader_count):
        luoo_download = LuooDownloader(
            luoo_site_mp3, 'D:/luoo', queue=spider_queue)
        luoo_download.daemon = True
        luoo_download.start()

    # Daemon threads are killed as soon as the main thread ends, so wait
    # until the spider has queued every volume and the workers have
    # marked each queued volume as done.
    luoo.join()
    spider_queue.join()
通过本文我们基本了解了网络爬虫的知识,在认识网络爬虫工作原理的同时,我们实现了一个真实的案例场景。这里主要使用了一些基础的第三方Python库来帮助我们实现爬虫,基本上演示了网络爬虫框架中的核心概念。通常在工作中,我们会使用一些比较优秀的爬虫框架来快速地实现需求,比如Scrapy框架。接下来我会使用Scrapy这类爬虫框架实现一个新的爬虫,来加深对网络爬虫的理解!
本文出自 “小公举” 博客,请务必保留此出处