码迷,mamicode.com
首页 > 编程语言 > 详细

一个python爬虫工具类

时间:2018-08-07 18:52:22      阅读:148      评论:0      收藏:0      [点我收藏+]

标签:ice   add   cat   utf8   参数   ons   dig   decorator   exception   

写了一个爬虫工具类。

# -*- coding: utf-8 -*-
# @Time    : 2018/8/7 16:29
# @Author  : cxa
# @File    : utils.py
# @Software: PyCharm
from retrying import retry
from decorators.decorators import decorator
from glom import glom
from config import headers
import datetime
import hashlib
# NOTE(review): `decorator` logs and swallows exceptions and a non-200 status
# just returns None, so `retry` presumably never sees a failure to retry on --
# confirm whether retries are actually expected here.
@retry(stop_max_attempt_number=3, wait_fixed=2000, stop_max_delay=10000)
@decorator
def post_html(session, post_url: str, post_data: dict, headers=headers, timeout=30):
    """Send a POST request through ``session`` and return the response on HTTP 200.

    :param session: session object whose ``post`` method performs the request
    :param post_url: URL the POST request is sent to
    :param post_data: data for the POST request, as a dict
    :param headers: request headers; defaults to ``headers`` from the config module
    :param timeout: timeout value passed through to ``session.post`` (default 30)
    :return: the response object, with ``encoding`` set to its
        ``apparent_encoding``, when the status code is 200; otherwise ``None``
    """
    post_req = session.post(url=post_url, headers=headers, data=post_data, timeout=timeout)
    if post_req.status_code != 200:
        return None
    post_req.encoding = post_req.apparent_encoding
    return post_req

# NOTE(review): `decorator` logs and swallows exceptions and a non-200 status
# just returns None, so `retry` presumably never sees a failure to retry on --
# confirm whether retries are actually expected here.
@retry(stop_max_attempt_number=3, wait_fixed=2000, stop_max_delay=10000)
@decorator
def get_response(session, url: str, headers=headers, timeout=30):
    """Send a GET request through ``session`` and return the response on HTTP 200.

    :param session: session object whose ``get`` method performs the request
    :param url: URL to request
    :param headers: request headers; defaults to ``headers`` from the config module
    :param timeout: timeout value passed through to ``session.get`` (default 30)
    :return: the response object, with ``encoding`` set to its
        ``apparent_encoding``, when the status code is 200; otherwise ``None``
    """
    req = session.get(url=url, headers=headers, timeout=timeout)
    if req.status_code != 200:
        return None
    req.encoding = req.apparent_encoding
    return req

@decorator
def get_html(req):
    """Return the ``text`` attribute of the response object ``req``."""
    return req.text

@decorator
def get_json(req):
    """Return whatever ``req.json()`` produces for the response object ``req``."""
    return req.json()

@decorator
def get_xpath(req, xpathstr: str):
    """Evaluate an XPath expression against ``req.html``.

    :param req: response object exposing an ``html`` attribute with an
        ``xpath`` method
    :param xpathstr: XPath expression to evaluate
    :return: whatever ``req.html.xpath(xpathstr)`` returns
    """
    return req.html.xpath(xpathstr)

@decorator
def get_json_data(jsonstr: str, pat: str):
    """Extract data from ``jsonstr`` with the glom module.

    :param jsonstr: target handed to ``glom`` as its first argument
        (NOTE(review): annotated ``str``, but presumably this is parsed JSON
        data such as the result of ``get_json`` -- confirm against callers)
    :param pat: glom spec describing what to extract
    :return: whatever ``glom(jsonstr, pat)`` returns
    """
    return glom(jsonstr, pat)

@decorator
def get_hash_code(key):
    """Return the hexadecimal MD5 digest of ``key`` encoded as UTF-8.

    :param key: string to hash (must provide ``encode``)
    :return: the digest as a hex string
    """
    return hashlib.md5(key.encode('utf-8')).hexdigest()

@decorator
def get_datetime_from_unix(unix_time):
    """Convert a Unix timestamp into a ``datetime.datetime``.

    A value that is not already an ``int`` is passed through ``int()`` before
    being handed to ``datetime.datetime.fromtimestamp``.
    """
    timestamp = unix_time if isinstance(unix_time, int) else int(unix_time)
    return datetime.datetime.fromtimestamp(timestamp)

以下是装饰器模块 decorators/decorators.py 的内容

# -*- coding: utf-8 -*-
# @Time    : 2018/03/28 15:35
# @Author  : cxa
# @File    : decorators.py
# @Software: PyCharm
from functools import wraps
from logger.log import get_logger
import traceback
def decorator(func):
    """Wrap ``func`` so that any ``Exception`` it raises is logged, not propagated.

    The wrapper calls ``func`` with the given arguments and returns its result.
    If ``func`` raises an ``Exception``, the function name and the formatted
    traceback are written at ERROR level to the logger from ``get_logger()``
    and the wrapper returns ``None``.

    :param func: callable to wrap
    :return: wrapper carrying ``func``'s metadata (via ``functools.wraps``)
    """
    @wraps(func)
    def log(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberate best-effort: record the failure, then yield None so
            # the caller keeps running instead of crashing.
            get_logger().error(
                "%s is error,here are details:%s",
                func.__name__,
                traceback.format_exc(),
            )
            return None
    return log

以下是提供 headers 的 config 模块的内容

import random

# Random numbers for the first, third and fourth fields of a version string;
# drawn once, in this order, when the module is imported.
first_num, third_num, fourth_num = (
    random.randint(low, high) for low, high in ((55, 62), (0, 3200), (0, 140))
)


class FakeChromeUA:
    """Build a Chrome-style User-Agent string with a randomly chosen OS token."""

    # Candidate platform tokens; get_ua() picks one at random on each call.
    os_type = [
        '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)',
    ]

    # Formatted once, when the class body runs, from the module-level random
    # numbers -- so every get_ua() call reports the same Chrome version.
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        """Return a User-Agent string.

        The string joins, with single spaces, the fixed Mozilla/AppleWebKit/
        Safari tokens, one random entry from ``os_type`` and ``chrome_version``.
        """
        parts = [
            'Mozilla/5.0',
            random.choice(cls.os_type),
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            cls.chrome_version,
            'Safari/537.36',
        ]
        return ' '.join(parts)


# Default request headers. The User-Agent is generated once, when this module
# is imported, and is then shared by every user of this dict.
headers = {
    'User-Agent': FakeChromeUA.get_ua(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}

以下是日志模块 logger/log.py 的内容

# -*- coding: utf-8 -*-
import os
import time
import logging
import sys
# Log file location: <parent of this file's directory>/logs/<YYYYMMDD>/t.log
log_dir1 = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
# Current local date, fixed at import time (strftime defaults to the current time).
today = time.strftime('%Y%m%d')
full_path = os.path.join(log_dir1, today)
# exist_ok=True avoids the race between a separate existence check and the
# directory creation.
os.makedirs(full_path, exist_ok=True)
log_path = os.path.join(full_path, "t.log")
def get_logger():
    """Return the logger named ``"t"``, configuring it on first use.

    When the logger has no handlers yet, a UTF-8 file handler writing to
    ``log_path`` and a console handler writing to ``sys.stdout`` are attached,
    both using the same format, and the level is set to INFO. Later calls
    return the already-configured logger without adding handlers again.

    :return: the ``logging.Logger`` instance named ``"t"``
    """
    logger = logging.getLogger("t")
    if not logger.handlers:
        # Output format shared by both handlers
        formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')

        # File log
        file_handler = logging.FileHandler(log_path, encoding="utf8")
        file_handler.setFormatter(formatter)

        # Console log
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)

        logger.addHandler(file_handler)
        logger.addHandler(console_handler)

        # Minimum level emitted by this logger
        logger.setLevel(logging.INFO)
    return logger

一个python爬虫工具类

标签:ice   add   cat   utf8   参数   ons   dig   decorator   exception   

原文地址:https://www.cnblogs.com/c-x-a/p/9438587.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!