知乎高颜值图片爬取

时间：2019-03-26 13:15:35 阅读：132 评论：0 收藏：0 [点我收藏+]

标签：follow net att wrap cli turn attribute exce pat

导入相关包

import time
import pydash
import base64
import requests
from lxml import etree
from aip import AipFace
from pathlib import Path

百度云人脸检测申请信息

# The only values you must fill in are the three lines below
# (they are passed to init_detective in main).
APP_ID = "xxxxxxxx"
API_KEY = "xxxxxxxxxxxxxxxx"
SECRET_KEY = "xxxxxxxxxxxxxxxx"
# Beauty-score cutoff used to filter images; faces scoring below it are
# rejected. Lower it freely if you have plenty of storage.
BEAUTY_THRESHOLD = 55

AUTHORIZATION = "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
# If you get a permission error, open Zhihu in a browser and copy a fresh value
# from the developer tools; no login is required.
# Replacing it is recommended: Zhihu's anti-crawler policy is unknown, and too
# many people sharing one value may stop the program from working.

以下皆无需改动

# Number of feed entries requested from Zhihu per call; a large value is not
# recommended -- be considerate of the server.
LIMIT = 5
# ID of the topic "美女" (beauties), which is the parent topic of
# "颜值" (looks, ID 20013528).
SOURCE = "19552207"

爬虫假装下正常浏览器请求

# Browser-like User-Agent string sent with every request.
USER_AGENT = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3"
# Referer header value: the "newest" page of the SOURCE topic.
REFERER = "https://www.zhihu.com/topic/%s/newest" % SOURCE
# URL template for the feed list of a topic; %s is filled with the topic ID.
BASE_URL = "https://www.zhihu.com/api/v4/topics/%s/feeds/timeline_activity"
# Query string appended to the initial request URL
URL_QUERY = "?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.comment_count&limit=" + str(
    LIMIT)

# HTTP headers shared by fetch_image and fetch_activities.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Referer": REFERER,
    "authorization": AUTHORIZATION
}

指定 url，获取对应原始内容 / 图片

def fetch_image(url, timeout=10):
    """Download the raw bytes found at ``url`` using the shared HEADERS.

    Args:
        url: Absolute URL of the resource to download (an image in this
            script).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes``. The HTTP status code is not checked,
        so the body of an error response is returned as-is.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # requests applies no timeout by default; without one a stalled server
    # would block the crawler indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    return response.content

指定 url，获取对应 JSON 返回 / 话题列表

def fetch_activities(url, timeout=10):
    """Request ``url`` with the shared HEADERS and return the decoded JSON body.

    Args:
        url: Feed-list URL to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON document returned by ``response.json()``.

    Raises:
        requests.RequestException: If the connection fails or times out.
        ValueError: If the body is not valid JSON (raised by
            ``response.json()``).
    """
    # requests applies no timeout by default; without one a stalled server
    # would block the crawler indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    return response.json()

处理返回的话题列表

# Path separators, characters rejected by common file systems, and line
# breaks/tabs; each one is replaced by "_" before a file name is used.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})


def parser_activities(datums, face_detective):
    """Save the qualifying images of one feed page and return the next page URL.

    For every entry whose target has "content", "question" and "author", the
    content HTML is scanned for ``<img src>`` URLs. Each http(s) image is
    downloaded with ``fetch_image`` and scored with ``face_detective``; images
    with a truthy score are written to the ``image`` directory next to this
    file as ``<score>--<author>--<title>--<seq>.jpg``.

    Args:
        datums: Decoded feed response; must contain "data" (list of entries)
            and "paging" (with "is_end" and, when not at the end, "next").
        face_detective: Callable taking image bytes and returning a score, or
            a falsy value when the image should be skipped.

    Returns:
        The URL of the next page, or ``None`` when "is_end" is true.
    """
    for data in datums["data"]:
        target = data["target"]
        if "content" not in target or "question" not in target or "author" not in target:
            continue
        content = target["content"]
        if not content:
            # Nothing to parse; also avoids handing an empty document to lxml.
            continue
        html = etree.HTML(content)
        if html is None:
            # NOTE(review): lxml may yield no root element for content without
            # markup -- treated here as "no images".
            continue
        title = target["question"]["title"]
        author = target["author"]["name"]
        seq = 0
        for image in html.xpath("//img/@src"):
            if not image.startswith("http"):
                continue
            image_data = fetch_image(image)
            score = face_detective(image_data)
            if not score:
                continue
            name = "{}--{}--{}--{}.jpg".format(score, author, title, seq)
            # Titles and author names are free text and may contain characters
            # that are illegal in file names (e.g. "/" or "?").
            name = name.translate(_UNSAFE_FILENAME_CHARS)
            seq += 1
            image_dir = Path(__file__).parent.joinpath("image")
            path = image_dir.joinpath(name)
            try:
                # The output directory is not shipped with the script; create
                # it on demand so the first write does not fail.
                image_dir.mkdir(parents=True, exist_ok=True)
                with open(path, "wb") as f:
                    f.write(image_data)
            except OSError as e:
                # Keep crawling on a failed write, but report it instead of
                # hiding it.
                print("failed to save {}: {}".format(path, e))
                continue
            print(path)
            # Pause between saved images to throttle the crawler.
            time.sleep(2)

    if not datums["paging"]["is_end"]:
        return datums["paging"]["next"]
    return None

初始化颜值检测工具

def init_detective(app_id, api_key, secret_key):
    """Create a face-scoring function backed by an ``AipFace`` client.

    Args:
        app_id: Application ID for the face-detection service.
        api_key: API key for the face-detection service.
        secret_key: Secret key for the face-detection service.

    Returns:
        A callable ``detective(image)`` that takes raw image bytes and returns
        the "beauty" score of the first detected face, or ``None`` when the
        image is rejected: no result, no face, face probability below 0.6,
        beauty below ``BEAUTY_THRESHOLD``, or gender type other than "female".
    """
    client = AipFace(app_id, api_key, secret_key)
    options = {"face_field": "age,gender,beauty,qualities"}

    def detective(image):
        # The client is called with image type "BASE64", so encode the bytes.
        encoded = base64.b64encode(image).decode("utf-8")
        result = client.detect(encoded, "BASE64", options).get("result")
        if not result or result.get("face_num") == 0:
            return None
        face_list = result.get("face_list")
        if not face_list:
            return None
        # Only the first face in the list is evaluated.
        face = face_list[0]
        probability = face.get("face_probability")
        score = face.get("beauty")
        # Missing fields would make the comparisons below raise TypeError.
        if probability is None or score is None:
            return None
        if probability < 0.6 or score < BEAUTY_THRESHOLD:
            return None
        if (face.get("gender") or {}).get("type") != "female":
            return None
        return score

    return detective

程序入口

def main():
    """Crawl the topic feed page by page, saving images that pass the filter.

    Starts from the first feed URL for ``SOURCE`` and keeps following the URL
    returned by ``parser_activities`` until it returns ``None``, pausing five
    seconds after each page.
    """
    scorer = init_detective(APP_ID, API_KEY, SECRET_KEY)
    next_url = "".join((BASE_URL % SOURCE, URL_QUERY))
    while next_url is not None:
        page = fetch_activities(next_url)
        next_url = parser_activities(page, scorer)
        time.sleep(5)

# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()

更多详情请参考文章出处：《知乎高颜值图片爬取》

知乎高颜值图片爬取

标签：follow net att wrap cli turn attribute exce pat

原文地址：https://www.cnblogs.com/li1992/p/10599398.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

知乎高颜值图片爬取

导入相关包

百度云 人脸检测 申请信息

以下皆无需改动

爬虫假装下正常浏览器请求

指定 url，获取对应原始内容 / 图片

指定 url，获取对应 JSON 返回 / 话题列表

处理返回的话题列表

初始化颜值检测工具

程序入口

百度云人脸检测申请信息