python_爬虫_模块

时间：2018-08-13 20:54:05 阅读：136 评论：0 收藏：0 [点我收藏+]

标签：随机 res encoding post请求 ons def 存储重复连接数

import pymysql
from urllib import request,parse
from urllib.error import HTTPError,URLError

def main(url, headers=None, data=None):
    """Entry point: delegate the request to ``get_response``.

    ``data`` is forwarded only when it is truthy; otherwise
    ``get_response`` is called with ``url`` and ``headers`` alone.

    Returns:
        Whatever ``get_response`` returns for the request.
    """
    if data:
        return get_response(url, headers=headers, data=data)
    return get_response(url, headers=headers)

def get_response(url, data=None, headers=None):
    """Send a request to ``url`` and return the decoded response body.

    Args:
        url: Address to request.
        data: Optional mapping of form fields. When truthy it is
            URL-encoded, converted to UTF-8 bytes and sent as the request
            body; otherwise the request carries no body.
        headers: Optional mapping of request headers. When falsy, a single
            ``User-Agent`` header obtained from ``get_agent()`` is used.

    Returns:
        The response body decoded with the ``bytes.decode`` default
        (UTF-8), or ``None`` when the request fails with ``HTTPError`` or
        ``URLError``; in that case the error is printed.
    """
    if not headers:
        headers = {'User-Agent': get_agent()}

    body = parse.urlencode(data).encode('utf-8') if data else None
    req = request.Request(url, data=body, headers=headers)

    try:
        # The context manager closes the underlying connection even if
        # reading or decoding the body raises.
        with request.urlopen(req) as response:
            return response.read().decode()
    except (HTTPError, URLError) as e:
        # HTTPError is a subclass of URLError; both are reported the same
        # way. The printed message is a summary only and is of limited use
        # for debugging.
        print(e)
    return None

def get_agent(table=None):
    """Return one randomly chosen User-Agent string from the MySQL database.

    Per the original author's note, the User-Agent strings were generated
    ahead of time with the ``fake_useragent`` module and stored in the
    database so that crawlers do not depend on that module at run time.

    Args:
        table: Name of the table to read from. Defaults to
            ``'p_useragent'`` when omitted or falsy. Only letters, digits
            and underscores are accepted because the name is interpolated
            into the SQL text (identifiers cannot be bound as parameters).

    Returns:
        The value of the second column of the selected row.

    Raises:
        ValueError: If ``table`` contains characters other than letters,
            digits and underscores.
        IndexError: If the query returns no row.
    """
    table = table or 'p_useragent'
    if not table.replace('_', '').isalnum():
        raise ValueError('invalid table name: {!r}'.format(table))

    # Pick a random id between MIN(Id) and MAX(Id) and take the first row
    # at or above it.
    sql = (
        'SELECT * FROM {0} WHERE id >= '
        '((SELECT MAX(Id) FROM {0})-(SELECT MIN(Id) FROM {0})) * RAND() '
        '+ (SELECT MIN(Id) FROM {0})  LIMIT 1'
    ).format(table)

    # NOTE(review): connection settings are hard-coded; consider moving
    # them to configuration.
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                           database='PaChong', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            row = cursor.fetchone()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    if row is None:
        raise IndexError('no user-agent row found in table {!r}'.format(table))
    return row[1]

if __name__ == '__main__':
    # Demo: POST a keyword to the Baidu Translate suggestion endpoint and
    # print the parsed JSON reply.
    import json

    url = 'http://fanyi.baidu.com/sug'
    data = {'kw': '中国'}
    raw = main(url, data=data)
    if raw is None:
        # No body came back, so there is nothing to parse; exit with an
        # error instead of letting json.loads(None) raise a TypeError.
        raise SystemExit('request failed: no response body to parse')
    res = json.loads(raw)
    print(res)

    # Example of a plain GET request:
    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)

正常情况下，每写一个爬虫，都需要执行“分析 -> 请求 -> 响应 -> 下载（存储）”的流程。但其中诸多功能其实都是在重复造轮子，比如发送请求、设置请求头、对 POST 请求的 data 值进行编码等。可以将这些功能写到一个 py 文件里，这样在编写其他爬虫文件时直接调用，就可以略过设置请求头、POST 传参转码等诸多操作。

python_爬虫_模块

标签：随机 res encoding post请求 ons def 存储重复连接数

原文地址：https://www.cnblogs.com/hejianlong/p/9470438.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行