Python 简单的爬虫

时间：2018-11-30 13:55:50 阅读：180 评论：0 收藏：0 [点我收藏+]

标签：调用函数 ISE 正则 return 结果 utf-8 编译 sts os.path

import urllib.request
import re
import ssl  # 处理https请求
import time
import os  # 创建目录用


def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to request; passed straight to ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as page:
        raw = page.read()  # bytes; must be decoded to get a str
    return raw.decode('utf-8')


# Pattern capturing the URL of each .jpg whose src attribute is directly
# followed by a width attribute. A raw string keeps the `\.` escape intact
# without triggering an invalid-escape warning.
reg = r'src="(.+?\.jpg)" width'
reg_img = re.compile(reg)  # compile once so the pattern object can be reused
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate checks are turned off for every later
# HTTPS request made through urllib in this process. It relies on private
# ssl attributes.
ssl._create_default_https_context = ssl._create_unverified_context
# Download the page and collect every matching image URL.
imglist = reg_img.findall(get_html('http://tieba.baidu.com/p/1753935195'))


def mkdir(path):
    """Create directory *path* (including missing parents) if it is absent.

    Surrounding whitespace and trailing backslashes are removed from *path*
    before it is used. A status message is printed in both outcomes.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    # Normalise: drop surrounding whitespace, then any trailing backslashes.
    path = path.strip().rstrip("\\")

    # Anything already present at that path counts as "exists".
    if os.path.exists(path):
        print(path + ' 目录已存在')
        return False

    os.makedirs(path)
    print(path + ' 创建成功')
    return True


# Directory that the downloaded images are saved into.
mkpath = "picture"
# Create the directory if needed; `picture` holds mkdir's boolean result
# (True when newly created, False when the path already existed).
picture = mkdir(mkpath)

x = 0  # running count of downloaded images
for img in imglist:
    # Each file is named after the current timestamp at download time.
    urllib.request.urlretrieve(img, mkpath + '/%s.jpg' % time.time())
    x += 1

print("图片下载完成")

Python 简单的爬虫

标签：调用函数 ISE 正则 return 结果 utf-8 编译 sts os.path

原文地址：https://www.cnblogs.com/dengnapianhuahai/p/10043118.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行