# Tags: from, planning, random, spawn, process pool, nal, now(), encoding, dig
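# -*- coding: utf-8 -*-
# Author: yeshengbao
# @Time : 2018/5/24 21:38
#
# Process: like one person who has several clones (ideally as many clones as
#   there are CPU cores), all working at almost the same time.
# Thread: like that person boiling water who, while the water heats up, can
#   also eat and sweep the floor. The scheduler picks among these tasks at
#   random, so the water may boil before the floor is swept while he keeps on
#   sweeping (this is how data can get lost).
# Coroutine: very similar to a thread but even lighter. The points at which it
#   switches tasks are planned in advance, so no extra time is wasted, and it
#   is cheaper to create.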
from gevent import monkey

# Patch the socket module before requests (and anything else) imports it, so
# that blocking network calls yield to other greenlets. Required for the
# coroutine version below to run requests concurrently.
monkey.patch_socket()

import datetime
import requests
import os
import hashlib
from multiprocessing import Process
from lxml import etree
from threading import Thread
import gevent

# Output directory for the downloaded text; created on first run.
bag = '書'
os.makedirs(bag, exist_ok=True)


class DouTu(object):
    """Download every chapter linked from a novel index page into one file.

    The index page is fetched once; each chapter link found in it is then
    fetched in its own gevent greenlet and appended to ``<bag>/text.txt``.
    """

    def __init__(self):
        # Index page listing all chapters of the novel.
        self.url = 'http://www.23us.so/files/article/html/6/6926/index.html'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/64.0.3282.186 Safari/537.36",
        }

    def md5(self, strs):
        """Return the hexadecimal MD5 digest of the text ``strs``."""
        return hashlib.md5(strs.encode('utf-8')).hexdigest()

    def get_source(self, url, headers, retries=3):
        """Fetch ``url`` and return the raw response body as bytes.

        Args:
            url: Address to request.
            headers: HTTP headers sent with the request.
            retries: Number of additional attempts after the first failure.

        Raises:
            requests.RequestException: if every attempt fails. (The previous
                implementation retried by unbounded recursion and ended in a
                RecursionError when the host stayed unreachable.)
        """
        attempts = max(retries, 0) + 1
        for attempt in range(attempts):
            try:
                return requests.get(url, headers=headers, timeout=10).content
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise

    def get_detail_content(self, frction_detail_url):
        """Fetch one chapter page and append ``title :content`` to the output file.

        Does nothing when the URL is empty, when the page has no title node,
        or when the chapter text is empty. Download errors propagate to the
        caller (inside a greenlet, gevent reports them and the other
        greenlets keep running).
        """
        if not frction_detail_url:
            return

        html = self.get_source(frction_detail_url, self.headers).decode('utf-8')
        doc = etree.HTML(html)

        titles = doc.xpath('.//div[@class="bdsub"]/dl/dd[1]/h1/text()')
        if not titles:
            # Page layout differs from what is expected (e.g. an error page);
            # skip it instead of failing with IndexError.
            return
        title = titles[0]

        content = ''.join(
            doc.xpath('.//div[@class="bdsub"]/dl/dd[@id="contents"]/text()')
        ).strip().replace('\n', '').replace('\t', '')
        if not content:
            return

        # os.path.join keeps the path valid on every platform; the hard-coded
        # backslash only worked on Windows.
        with open(os.path.join(bag, 'text.txt'), 'a', encoding='utf-8') as fp:
            fp.write(title + ' :' + content + '\n')
        print('正在写入{}_{}'.format(title, content))

    def analysis_index(self, html):
        """Spawn one greenlet per chapter link in the index ``html`` and wait for all.

        Chapters are written in completion order, which is not necessarily
        the order in which they appear in the index.
        """
        doc = etree.HTML(html)
        td_list = doc.xpath('.//table[@id="at"]//td[@class="L"]')

        thread_list = []
        for td in td_list:
            hrefs = td.xpath('./a/@href')
            if not hrefs:
                # Table cell without a chapter link.
                continue
            # gevent.spawn already schedules the greenlet; no start() needed.
            xie = gevent.spawn(self.get_detail_content, hrefs[0])
            thread_list.append(xie)
            print(xie)

            # Thread-based alternative kept for reference (caps the number of
            # live threads at 100):
            # while True:
            #     if len(thread_list) < 100:  # how many threads to allow
            #         th = Thread(target=self.get_detail_content, args=(td.xpath('./a/@href')[0], ))
            #         th.start()
            #         thread_list.append(th)
            #         break
            #     else:
            #         print(thread_list)
            #         time.sleep(3)
            #         for ths in thread_list:
            #             if not ths.is_alive():
            #                 thread_list.remove(ths)

        # Wait until every greenlet (or thread) has finished.
        gevent.joinall(thread_list)

    def begin_spider(self):
        """Download the index page and crawl every chapter it links to."""
        html = self.get_source(self.url, self.headers).decode('utf-8')
        self.analysis_index(html)


if __name__ == '__main__':
    start_time = datetime.datetime.now()  # program start time
    doutu = DouTu()
    doutu.begin_spider()
    over_time = datetime.datetime.now()  # program end time
    total_time = (over_time - start_time).total_seconds()
    print('程序共计%s秒' % total_time)

# Measured on 620 pages: threads took about 40 s, coroutines about 18 s.

# Process-based usage; it must live inside ``if __name__ == '__main__':``.
# thread_lists = []
# for page in range(50, 81):
#     while True:
#         if len(thread_lists) < 8:
#             # th = threading.Thread(target=dou.begin_by_page, args=(page,))
#             th = multiprocessing.Process(target=dou.begin_by_page, args=(page,))
#             th.start()
#             thread_lists.append(th)
#             break
#         else:
#             time.sleep(3)
#             print(thread_lists)
#             print('进程池已经满了')
#             for ths in thread_lists:
#                 if not ths.is_alive():
#                     thread_lists.remove(ths)
# for ths in thread_lists:
#     ths.join()
# Tags: from, planning, random, spawn, process pool, nal, now(), encoding, dig
# Original article: https://www.cnblogs.com/yijian001/p/9085766.html