
Ajax Data Crawling


 

 

    Ajax, short for "Asynchronous JavaScript And XML", is a web development technique for building fast, interactive, dynamic web applications: it can update part of a page without reloading the whole page.

 

    By exchanging small amounts of data with the server in the background, Ajax lets a page update asynchronously, so a specific part of the page can be refreshed while the rest stays in place.
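    For a crawler this means the interesting data often never appears in the initial HTML at all: it arrives later through XHR (Ajax) requests that return JSON, which can be located under the Network → XHR tab of the browser's developer tools and replayed directly. Below is a minimal sketch of such a replay; the URL and the page parameter are placeholders standing in for whatever the target site's Ajax endpoint actually expects.

import requests

# Placeholder endpoint and pagination parameter; substitute the real values
# observed under Network -> XHR in the browser's developer tools.
AJAX_URL = 'https://example.com/api/list'

def fetch_ajax(page):
    headers = {
        # Many sites send this header on their own Ajax calls; mirroring it
        # makes the replayed request look like the browser's XHR.
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0',
    }
    params = {'page': page}
    response = requests.get(AJAX_URL, headers=headers, params=params, timeout=10)
    response.raise_for_status()
    return response.json()   # Ajax endpoints usually answer with JSON

if __name__ == '__main__':
    print(fetch_ajax(1))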

 

 

 

1. Crawling Ajax data from a Weibo page
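The comments under a Weibo post are loaded exactly this way: paging through them fires requests to https://weibo.com/aj/v6/comment/big, which returns JSON whose data.html field holds a ready-made HTML fragment. The code below replays that request for pages 1–10 (the page parameter), parses each fragment with pyquery to pull out the commenter, the comment text and the timestamp, and stores every comment as a document in MongoDB.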

import requests
import pymongo
from pyquery import PyQuery as pq

def get_ajax_page(page):
    """Request one page of comments from Weibo's Ajax comment endpoint."""
    headers = {
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567',
    }
    url = 'https://weibo.com/aj/v6/comment/big'
    params = {
        'ajwvr': 6,
        'id': 4483557667874538,          # id of the Weibo post whose comments are fetched
        'root_comment_max_id_type': 0,
        'page': page,                    # page number of the comment list
    }
    try:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('error', e.args)


def parse_page(js):
    """Parse the HTML fragment carried in the JSON response and yield one dict per comment."""
    data = js.get('data')
    html = data.get('html')
    doc = pq(html)
    items = doc('div.list_con').items()
    for item in items:
        # The comment text starts with the commenter's name; split it off once
        # so the rest of the text is kept intact as the content.
        text = item('.WB_text').text().split(maxsplit=1)
        msg = {}
        msg['name'] = text[0]
        msg['content'] = text[1] if len(text) > 1 else ''
        msg['datetime'] = item('div.WB_from.S_txt2').text()

        yield msg


def collection_mongo(host='localhost', port=27017):
    client = pymongo.MongoClient(host=host, port=port)
    return client


def save_mongo(client, data):
    db = client.weibo
    collection = db.weibo

    if collection.insert_one(data):
        print('Save to mongo')


def search_mongo(client):
    db = client.weibo
    collection = db.weibo
    result = collection.find()
    return result


def main():
    # Walk the first 10 comment pages and store every parsed comment.
    client = collection_mongo('10.0.0.100')
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        results = parse_page(js)
        for result in results:
            save_mongo(client, result)


if __name__ == '__main__':
    # main()
    client = collection_mongo('10.0.0.100')
    data = search_mongo(client)
    for item in data:
        print(item)
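One thing to note about the storage step: insert_one() appends a new document on every call, so re-running the crawl stores the same comments twice. A small variation makes the save idempotent by upserting instead; this is only a sketch, assuming the commenter name plus timestamp is unique enough to serve as a key.

def save_mongo(client, data):
    collection = client.weibo.weibo
    # Update the existing document if one matches, otherwise insert it.
    collection.update_one(
        {'name': data['name'], 'datetime': data['datetime']},
        {'$set': data},
        upsert=True,
    )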

 

 

 

2. Crawling Toutiao street-photography images via Ajax
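Toutiao's street-photography ("街拍") search results are loaded the same way: scrolling triggers Ajax requests that return JSON pages of results. The sketch below follows that pattern, but the endpoint https://www.toutiao.com/api/search/content/, the query parameters (offset, keyword, count, ...) and the data / image_list fields of the response are assumptions about the interface and should be verified against the actual XHR requests in the developer tools; the site may also require cookies or signed parameters.

import os
import hashlib
import requests

SEARCH_URL = 'https://www.toutiao.com/api/search/content/'   # assumed endpoint

def get_toutiao_page(offset):
    # Assumed query parameters; 'offset' pages through results 20 at a time.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': 20,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
    }
    try:
        response = requests.get(SEARCH_URL, params=params, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('error', e.args)

def parse_images(js):
    # 'data' and 'image_list' are assumed field names in the JSON response.
    for item in (js or {}).get('data', []):
        title = item.get('title') or 'untitled'
        for image in item.get('image_list', []):
            url = image.get('url')
            if url:
                yield {'title': title, 'url': url}

def save_image(item):
    # Save each picture into a folder named after its article title, using an
    # MD5 of the content as the file name so repeated runs skip nothing but
    # never collide.
    response = requests.get(item['url'], timeout=10)
    if response.status_code == 200:
        os.makedirs(item['title'], exist_ok=True)
        name = hashlib.md5(response.content).hexdigest() + '.jpg'
        with open(os.path.join(item['title'], name), 'wb') as f:
            f.write(response.content)

def main():
    for offset in range(0, 100, 20):
        js = get_toutiao_page(offset)
        for item in parse_images(js):
            save_image(item)

if __name__ == '__main__':
    main()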


 

Original post: https://www.cnblogs.com/Caiyundo/p/12554341.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!