喜欢去知乎炸鱼？用 Python 吧

时间：2019-05-21 12:40:32 阅读：181 评论：0 收藏：0 [点我收藏+]

标签：ams bsp for 地址 contex 跳过 set 女孩子 error

知乎高赞贴：

有一双大长腿是什么体验？

有一副迷人的身材是什么体验？

别再用手机费劲地翻了，让 Python 助你一臂之力：

import re
import requests
import os
import urllib.request
import ssl

from urllib.parse import urlsplit
from os.path import basename

# Globally disable certificate verification by replacing the ssl module's
# default HTTPS context factory with the unverified one.
# NOTE(review): this is process-wide and removes protection against
# man-in-the-middle attacks for code relying on the default context —
# confirm it is really needed.
ssl._create_default_https_context = ssl._create_unverified_context

# Default HTTP headers passed to get_image_url(): a desktop Chrome
# User-Agent string and the content encodings the client accepts.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    'Accept-Encoding': 'gzip, deflate',
}


def mkdir(path):
    """Create the directory *path* (including parents) unless it exists.

    Args:
        path: Directory to create; may be relative or absolute.

    Returns:
        True if the directory was created by this call, False if *path*
        already existed.
    """
    if os.path.exists(path):
        # os.path.join leaves an absolute *path* untouched instead of gluing
        # the current working directory in front of it.
        print(u"图片存放于：", os.path.join(os.getcwd(), path))
        return False

    print('新建文件夹：', path)
    os.makedirs(path)
    return True


def download_pic2(img_lists, dir_name):
    """Download each image in *img_lists* into *dir_name* with progress output.

    The local file name is the last path component of the image URL. Files
    that already exist on disk are reported and skipped; everything else is
    handed to ``auto_download``.

    Args:
        img_lists: Sequence of image URLs.
        dir_name: Directory the images are stored in.
    """
    total = len(img_lists)
    print("一共有{num}张照片".format(num=total))

    # enumerate supplies the 1-based progress counter for every URL,
    # including the ones that are skipped.
    for position, image_url in enumerate(img_lists, start=1):
        target = dir_name + os.sep + basename(urlsplit(image_url).path)

        if not os.path.exists(target):
            auto_download(image_url, target)
            print("下载{pic_name}完成！({index}/{sum})".format(pic_name=target, index=position, sum=total))
        else:
            # Already downloaded earlier: report it and move on.
            print("文件{file_name}已存在。".format(file_name=target))


def auto_download(url, file_name, max_attempts=5):
    """Download *url* to *file_name*, retrying a bounded number of times.

    The previous implementation retried by calling itself without any limit,
    so a permanent failure (for example an HTTP 404, which is a ``URLError``
    subclass) recursed until ``RecursionError``. Retries are now a loop
    capped by *max_attempts*.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the download is written to.
        max_attempts: Maximum number of download attempts; values below 1
            are treated as 1.

    Raises:
        urllib.error.URLError: Re-raised from the final failed attempt
            (includes the subclass ``ContentTooShortError``).
    """
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            urllib.request.urlretrieve(url, file_name)
            return
        except urllib.request.ContentTooShortError:
            # Must be handled before URLError, which is its base class.
            if attempt == attempts:
                # Remove the truncated file so that a later run that skips
                # existing files does not mistake it for a finished download.
                if os.path.exists(file_name):
                    os.remove(file_name)
                raise
            print("文件下载不完整，重新下载。")
        except urllib.request.URLError:
            if attempt == attempts:
                raise
            print("网络连接出错，尝试重新下载。")


def download_pic(img_lists, dir_name):
    """Fetch every image URL with ``requests`` and save it under *dir_name*.

    URLs that do not answer with HTTP 200 are skipped silently. The local
    file name is the last path component of the image URL. A failure to
    write a file is reported and the loop moves on to the next URL.

    Args:
        img_lists: Sequence of image URLs.
        dir_name: Directory the images are written to.
    """
    print("一共有{num}张照片".format(num=len(img_lists)))
    for image_url in img_lists:
        response = requests.get(image_url, stream=True)
        if response.status_code != 200:
            continue
        image = response.content

        file_name = dir_name + os.sep + basename(urlsplit(image_url)[2])

        # The ``with`` statement already closes the file. The former
        # ``finally: picture.close()`` raised UnboundLocalError whenever
        # open() itself failed, which defeated the IOError handler below.
        try:
            with open(file_name, "wb") as picture:
                picture.write(image)
        except IOError:
            print("IO Error\n")
            continue

        print("下载{pic_name}完成！".format(pic_name=file_name))


def get_image_url(qid, headers):
    """Collect image URLs from the answers of one Zhihu question.

    Pages through the ``QuestionAnswerListV2`` endpoint ten answers at a
    time until a page with no answers comes back. For every answer, the
    values of all ``data-original`` attributes are extracted, de-duplicated
    within that answer, and kept when they end in ``r.jpg``.

    Args:
        qid: Question id, sent to the endpoint as ``url_token``.
        headers: HTTP headers sent with every request.

    Returns:
        List of matching image URLs in the order they were encountered.
    """
    tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"
    # NOTE(review): paging starts at offset 10, so offsets 0-9 are never
    # requested — confirm whether skipping the first page is intended.
    size = 10
    image_urls = []

    # Compiled once instead of on every page.
    imgreg = re.compile('data-original="(.*?)"', re.S)

    session = requests.Session()

    while True:
        postdata = {'method': 'next',
                    'params': '{"url_token":' + str(qid) + ',"pagesize": "10",' + '"offset":' + str(size) + "}"}
        page = session.post(tmp_url, headers=headers, data=postdata)
        # SECURITY: this used to be eval(page.text), which executes whatever
        # the server sends and also breaks on JSON true/false/null. The body
        # is parsed as JSON instead.
        ret = page.json()
        answers = ret['msg']
        print(u"答案数：%d" % (len(answers)))

        size += 10

        if not answers:
            print("图片 URL 获取完毕, 页数: ", (size - 10) / 10)
            return image_urls

        for answer in answers:
            # Presumably each answer is an HTML fragment (str) — verify
            # against the endpoint's response format.
            # Strip any backslash escapes still present in the URLs.
            cleaned = [item.replace("\\", "") for item in re.findall(imgreg, answer)]

            # De-duplicate within the answer; dict.fromkeys keeps first-seen
            # order, unlike set(), so the result is deterministic.
            for item in dict.fromkeys(cleaned):
                # Only URLs ending in r.jpg are kept; per the original
                # comment this filters out avatars.
                if item.endswith('r.jpg'):
                    print(item)
                    image_urls.append(item)

        print('size: %d, num : %d' % (size, len(image_urls)))


if __name__ == '__main__':
    # Question to scrape. The title is only used to name the output folder.
    title = '拥有一副令人羡慕的好身材是怎样的体验？'
    question_id = 297715922

    # Alternative questions: uncomment one title/question_id pair (and
    # comment out the pair above) to scrape a different question.

    # title = '身材好是一种怎样的体验？'
    # question_id = 26037846

    # title = '女孩子胸大是什么体验？'
    # question_id = 291678281

    # title = '女生什么样的腿是美腿？'
    # question_id = 310786985

    # title = '你的择偶标准是怎样的？'
    # question_id = 275359100

    # title = '什么样才叫好看的腿？'
    # question_id = 63727821

    # title = '身材对女生很重要吗？'
    # question_id = 307403214

    # title = '女生腿长是什么样的体验？'
    # question_id = 273711203

    # title = '女生腕线过裆是怎样一种体验？'
    # question_id = 315236887

    # title = '有着一双大长腿是什么感觉？'
    # question_id = 292901966

    # title = '拥有一双大长腿是怎样的体验？'
    # question_id = 285321190

    # title = '大胸女生如何穿衣搭配？'
    # question_id = 26297181

    # title = '胸大到底怎么穿衣服好看?'
    # question_id = 293482116

    # Output folder is "<question_id>_<title>", relative to the working
    # directory.
    path = str(question_id) + '_' + title
    mkdir(path)  # create the local folder
    img_list = get_image_url(question_id, headers)  # collect the image URLs
    download_pic2(img_list, path)  # save the images

喜欢去知乎炸鱼？用 Python 吧

标签：ams bsp for 地址 contex 跳过 set 女孩子 error

原文地址：https://www.cnblogs.com/pyyu/p/10898998.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行