码迷,mamicode.com
首页 > 编程语言 > 详细

python 进程/线程/协程 测试

时间:2018-05-25 00:25:32      阅读:240      评论:0      收藏:0      [点我收藏+]

标签:from   规划   随机   spawn   进程池   nal   now()   encoding   dig   

# Author: yeshengbao
# --      coding: utf-8     --
# @Time  : 2018/5/24  21:38
# 进程:如同一个人拥有多个分身(分身数最好等于 CPU 核心数),各分身几乎同时进行工作
# 线程:如这个人正在烧开水,但在烧水的时间内又可以去吃饭和扫地,这时线程会在这些任务之间随机切换;可能出现地还没扫完水就开了、但他还在扫地的情况{这就可能出现数据丢失}。
# 协程:与线程非常相似,但比线程更轻量;它在执行任务时,切换时机已经被规划好了,不会造成额外的时间浪费,创建时也更省资源

# gevent must patch the socket module before anything that uses sockets
# (here: requests) is imported, otherwise network calls block the whole
# process instead of yielding to other greenlets.
from gevent import monkey

monkey.patch_socket()

import datetime  # noqa: E402
import hashlib  # noqa: E402
import os  # noqa: E402
from multiprocessing import Process  # noqa: E402,F401
from threading import Thread  # noqa: E402,F401

import gevent  # noqa: E402
import requests  # noqa: E402
from lxml import etree  # noqa: E402

# Directory that receives the downloaded text.
# NOTE(review): the original value was lost when this listing was extracted
# (the source reads ``bag = `` with nothing after it). "novel" is a
# placeholder -- replace it with the intended output directory.
bag = "novel"
os.makedirs(bag, exist_ok=True)


class DouTu(object):
    """Download every chapter linked from a novel index page.

    Chapters are fetched concurrently, one gevent greenlet per chapter, and
    appended to ``text.txt`` inside the ``bag`` directory.
    """

    def __init__(self):
        # Index page listing all chapters of the novel.
        self.url = "http://www.23us.so/files/article/html/6/6926/index.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/64.0.3282.186 Safari/537.36",
        }

    def md5(self, strs):
        """Return the hexadecimal MD5 digest of the UTF-8 encoding of *strs*."""
        return hashlib.md5(strs.encode("utf-8")).hexdigest()

    def get_source(self, url, headers, retries=3):
        """Fetch *url* and return the raw response body as bytes.

        The request is attempted up to *retries* times (at least once).
        The original implementation retried by unbounded recursion, which
        ends in ``RecursionError`` for a permanently failing URL; the retry
        count is now bounded.

        Raises:
            requests.RequestException: if every attempt fails.
        """
        attempts = max(1, retries)
        for attempt in range(1, attempts + 1):
            try:
                return requests.get(url, headers=headers, timeout=10).content
            except requests.RequestException:
                if attempt == attempts:
                    raise

    def get_detail_content(self, frction_detail_url):
        """Download one chapter page and append it to the output file.

        Does nothing when *frction_detail_url* is empty, when the page cannot
        be fetched, or when no title / body text is found on the page.
        """
        if not frction_detail_url:
            return

        try:
            raw = self.get_source(frction_detail_url, self.headers)
        except requests.RequestException as err:
            # One unreachable chapter should not abort the whole crawl.
            print("skipping {}: {}".format(frction_detail_url, err))
            return

        doc = etree.HTML(raw.decode("utf-8"))
        titles = doc.xpath('.//div[@class="bdsub"]/dl/dd[1]/h1/text()')
        if not titles:
            return
        title = titles[0]

        fragments = doc.xpath('.//div[@class="bdsub"]/dl/dd[@id="contents"]/text()')
        content = "".join(fragments).strip().replace("\n", "").replace("\t", "")
        if not content:
            return

        # os.path.join instead of a hard-coded backslash keeps the path
        # valid on non-Windows systems too.
        with open(os.path.join(bag, "text.txt"), "a", encoding="utf-8") as fp:
            fp.write(title + ":" + content + "\n")
        print("正在写入{}_{}".format(title, content))

    def analysis_index(self, html):
        """Spawn one greenlet per chapter link found in *html* and wait for all."""
        doc = etree.HTML(html)
        jobs = []
        for td in doc.xpath('.//table[@id="at"]//td[@class="L"]'):
            hrefs = td.xpath("./a/@href")
            if not hrefs:
                continue  # table cell without a chapter link
            # gevent.spawn already schedules the greenlet; no start() needed.
            job = gevent.spawn(self.get_detail_content, hrefs[0])
            jobs.append(job)
            print(job)

            # Thread-based alternative kept from the original for comparison
            # (requires ``import time``):
            # while True:
            #     if len(thread_list) < 100:  # cap on concurrently running threads
            #         th = Thread(target=self.get_detail_content, args=(hrefs[0],))
            #         th.start()
            #         thread_list.append(th)
            #         break
            #     else:
            #         print(thread_list)
            #         time.sleep(3)
            #         for ths in thread_list:
            #             if not ths.is_alive():
            #                 thread_list.remove(ths)

        # Wait until every greenlet has finished before returning.
        gevent.joinall(jobs)

    def begin_spider(self):
        """Fetch the index page and crawl every chapter it links to."""
        html = self.get_source(self.url, self.headers).decode("utf-8")
        self.analysis_index(html)


def main():
    """Run the crawl once and print the elapsed wall-clock time."""
    start_time = datetime.datetime.now()  # program start time
    doutu = DouTu()
    doutu.begin_spider()
    over_time = datetime.datetime.now()  # program end time
    total_time = (over_time - start_time).total_seconds()
    print("程序共计%s秒" % total_time)


if __name__ == "__main__":
    main()

# Timings reported by the original author:
#   threads,    620 pages: about 40 s
#   coroutines:            18 s
#
# Process-based variant from the original. It must run inside
# ``if __name__ == '__main__':``. ``dou.begin_by_page`` is not defined in
# this script -- it appears to come from a different crawler of the author's.
# thread_lists = []
# for page in range(50, 81):
#     while True:
#         if len(thread_lists) < 8:
#             # th = threading.Thread(target=dou.begin_by_page, args=(page,))
#             th = multiprocessing.Process(target=dou.begin_by_page, args=(page,))
#             th.start()
#             thread_lists.append(th)
#             break
#         else:
#             time.sleep(3)
#             print(thread_lists)
#             print('process pool is full')
#             for ths in thread_lists:
#                 if not ths.is_alive():
#                     thread_lists.remove(ths)
# for ths in thread_lists:
#     ths.join()

 

python 进程/线程/协程 测试

标签:from   规划   随机   spawn   进程池   nal   now()   encoding   dig   

原文地址:https://www.cnblogs.com/yijian001/p/9085766.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!