码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫2

时间:2018-08-27 10:25:42      阅读:108      评论:0      收藏:0      [点我收藏+]

标签:__name__   global   date   import   rac   []   return   start   class   

scrapMain.py

# -*- coding:utf-8 -*-

import os
import xlrd
import Queue
import time
from Excel_Main import Excel_Main
from ScrapData import ScrapData
from multiThread import MyThread
from write2Excel import writeRatioDate

SHARE_Q = Queue.Queue()  # job queue shared by all workers; no maxsize given, so it is unbounded
DATA_SET = set()         # results gathered by the worker threads
_WORKER_THREAD_NUM = 4   # number of worker threads to start


def handleExcel(fileUrl="Excel.xlsx"):
    """Convert the raw workbook via ``Excel_Main.handle``.

    The conversion itself is delegated to ``Excel_Main``; it is expected to
    produce ``Intelligent_analysis.xlsx`` (see the success message below).

    Args:
        fileUrl: Path of the Excel workbook to convert.

    Raises:
        AssertionError: If ``fileUrl`` does not exist.
    """
    # Validate the path before building the converter so a bad path fails
    # fast. Raised explicitly instead of via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not os.path.exists(fileUrl):
        raise AssertionError("Input workbook not found: %s" % fileUrl)

    excel = Excel_Main()
    excel.handle(fileUrl)
    print("Creat \'Intelligent_analysis.xlsx\' successfully!")


def getLinks():
    """Read the (index, link) pairs from ``Intelligent_analysis.xlsx``.

    Pairs are built from column 0 and column 11 (spreadsheet column L) of
    the first sheet. The first row is dropped (presumably a header row;
    confirm against the workbook layout).

    Returns:
        list: ``(index, link)`` tuples, one per remaining row. Empty when the
        sheet has no rows beyond the first.
    """
    workBook = xlrd.open_workbook("Intelligent_analysis.xlsx")
    workSheet = workBook.sheets()[0]
    # The links (log trace) live in column L.
    # ``list(...)`` keeps this a real list even where ``zip`` is lazy.
    tcIndex_link_set = list(
        zip(workSheet.col_values(0), workSheet.col_values(11))
    )
    print("Get links from \'Intelligent_analysis.xlsx\'!")
    # The printed count still includes the first row, as before.
    print(len(tcIndex_link_set))
    # Slice instead of ``del lst[0]`` so an empty sheet yields [] rather than
    # raising IndexError.
    return tcIndex_link_set[1:]


def worker():
    """Process jobs from ``SHARE_Q`` until the queue is empty.

    Each job is handed to ``ScrapData.getPassRation`` and the result is added
    to ``DATA_SET``. The queue does its own locking, so no explicit lock is
    taken around ``get``/``task_done``.

    A job that raises is reported with a traceback and skipped, so one bad
    link does not stop this worker. ``task_done`` is called for every job
    taken, whether it succeeded or not, so ``SHARE_Q.join()`` cannot hang on
    a failed job.
    """
    import traceback

    while True:
        start = time.time()
        try:
            # Non-blocking get: checking ``empty()`` and then calling a
            # blocking ``get()`` is racy, because another worker may take the
            # last job in between and leave this thread blocked forever.
            tcIndex_link_set = SHARE_Q.get_nowait()  # fetch a job
        except Queue.Empty:
            return

        try:
            scrapData = ScrapData()
            tcIndex_ratio_set = scrapData.getPassRation(tcIndex_link_set)
            DATA_SET.add(tcIndex_ratio_set)
            end = time.time()

            print("<<<<<<<<<<<<<<<<<<<<<<=================>>>>>>>>>>>>>>>>>>>>>>")
            print("One job Done! PassRatio: %s | Used time: %s | TotalItem: %i" % (tcIndex_ratio_set, (end-start), len(DATA_SET)))
        except Exception:
            # Worker-thread boundary: report the failure and move on to the
            # next job instead of killing the thread.
            traceback.print_exc()
        finally:
            # Always balance the get() above, even when the job failed.
            SHARE_Q.task_done()
 

def main():
    """Run the whole pipeline once.

    Steps: convert the workbook, read its links, queue one job per link,
    start the worker threads, wait for the queue to drain, then pass the
    collected results to ``writeRatioDate``.
    """
    # Pre-processing: build the analysis workbook and pull its links.
    handleExcel()
    link_pairs = getLinks()

    # Queue one job per (index, link) pair. In real use jobs should be fed
    # in continuously rather than all up front.
    for pair in link_pairs:
        SHARE_Q.put(pair)

    # Start _WORKER_THREAD_NUM worker threads; each begins processing
    # as soon as it is started.
    threads = []
    for _ in xrange(_WORKER_THREAD_NUM):
        runner = MyThread(worker)
        runner.start()
        threads.append(runner)

    # Block until every queued job has been marked done.
    SHARE_Q.join()

    # Hand the collected results over for writing back to the Excel file.
    writeRatioDate(DATA_SET, hightlightNum=5)


if __name__ == "__main__":
    # Script entry point: run the pipeline and report the total wall time.
    startTime = time.time()
    main()
    print("Append PassRatio successfully!")
    endTime = time.time()
    print("Totally used time: %s" % (endTime-startTime))

 

爬虫2

标签:__name__   global   date   import   rac   []   return   start   class   

原文地址:https://www.cnblogs.com/charlieLeo/p/9540316.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!