码迷,mamicode.com
首页 > 其他好文 > 详细

爬取知乎话题async使用协程

时间:2018-08-03 18:47:11      阅读:183      评论:0      收藏:0      [点我收藏+]

标签:mac   ade   tps   aging   order   cti   task   fse   cio   

import requests
import json
import time
from pyquery import PyQuery
import pandas as pd
from collections import OrderedDict
import multiprocessing
import asyncio
from functools import partial
# The cookie and the start URL could instead be prompted for interactively
# with input(); they are hard-coded here.

# First page of the topic's "top activity" feed; later pages are discovered
# by following the paging links returned by the API.
init_url = 'https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity?offset=5&limit=10'

# HTTP headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    # NOTE(review): '**' looks like a redacted placeholder -- presumably a
    # real session cookie has to be supplied here; confirm before running.
    'Cookie': '**',
    'Referer': 'https://www.zhihu.com/topic/19606409/hot',
    'Host': 'www.zhihu.com',
    'X-UDID': 'AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA=',
}

def get_all_url(url):
    """Collect the feed's page URLs by following its paging links.

    Starting at *url*, each page is fetched with the module-level
    ``headers`` and its ``paging.next`` link is appended to the module-level
    ``url_list``. The walk stops after the first page whose
    ``paging.is_end`` flag is truthy. The starting *url* itself is not added
    to the list; only the ``next`` links are (including the one reported by
    the final page).

    Pages are walked in a loop rather than by recursion so that long feeds
    cannot exhaust the interpreter's recursion limit, and so that the list
    is returned to the caller regardless of how many pages were visited.

    Args:
        url: URL of the first page to request.

    Returns:
        The module-level ``url_list`` that was appended to.
    """
    while True:
        res = requests.get(url, headers=headers)
        paging = json.loads(res.text)['paging']
        next_page_url = paging['next']
        url_list.append(next_page_url)
        # Progress indicator: number of page URLs collected so far.
        print(len(url_list))
        if paging['is_end']:
            return url_list
        url = next_page_url



async def get_all_data(url):
    """Fetch one feed page and append its answers to ``data_list``.

    The page is downloaded with ``requests.get`` (using the module-level
    ``headers``) inside the event loop's default executor, so the blocking
    HTTP call does not stall other coroutines. Every entry of the response's
    ``data`` array whose ``target.type`` is ``'answer'`` becomes an
    ``OrderedDict`` with the keys ``title``, ``content``, ``comment_count``
    and ``voteup_count``, appended to the module-level ``data_list``.
    Entries of any other type are skipped.

    Args:
        url: URL of the feed page to download.
    """
    # Inside a coroutine this returns the loop currently running it, so the
    # function does not depend on a module-level ``loop`` variable.
    running_loop = asyncio.get_event_loop()
    res = await running_loop.run_in_executor(
        None, partial(requests.get, url, headers=headers)
    )
    data = json.loads(res.text)
    # Progress indicator: number of answers collected before this page.
    print(len(data_list))
    for item in data['data']:
        target = item['target']
        if target['type'] != 'answer':
            continue
        final_data = OrderedDict()
        final_data['title'] = target['question']['title'] or ''
        try:
            final_data['content'] = PyQuery(target['content']).text()
        except Exception:
            # Deliberate best-effort: if the full content is missing or
            # cannot be parsed, fall back to the excerpt instead of dropping
            # the answer.
            final_data['content'] = PyQuery(target['excerpt']).text()
        final_data['comment_count'] = target['comment_count']
        final_data['voteup_count'] = target['voteup_count']
        data_list.append(final_data)

if __name__ == '__main__':
    # Shared accumulators filled in by get_all_url() and get_all_data().
    data_list = []
    url_list = []
    get_all_url(init_url)

    # Create the loop explicitly *before* scheduling tasks so they are bound
    # to it, instead of relying on an implicitly created current loop. It is
    # kept in the module-level name ``loop``, which get_all_data may read.
    loop = asyncio.new_event_loop()
    try:
        tasks = [loop.create_task(get_all_data(url)) for url in url_list]
        # asyncio.wait() raises ValueError on an empty set of tasks.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        # Always release the loop, even if a download step fails.
        loop.close()

    # One spreadsheet row per collected answer; the file name carries a
    # timestamp.
    df1 = pd.DataFrame(data_list)
    df1.to_excel('保险' + time.strftime("%Y%m%d%H%M%S") + '.xlsx', index=False)
    print('done')

 

爬取知乎话题async使用协程

标签:mac   ade   tps   aging   order   cti   task   fse   cio   

原文地址:https://www.cnblogs.com/Erick-L/p/9415677.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!