
Scraping Tmall Phone Reviews

Posted: 2017-11-01 19:36:08


import re
import json
import time
import requests
from bs4 import BeautifulSoup 
 
 
tm_headers = {
    "scheme": "https",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Content-Type": "text/html",
}
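
A quick sanity check that these browser-like headers pass Tmall's basic bot filtering (illustrative only; the status code you get depends on the live site):

resp = requests.get("https://shouji.tmall.com/", headers=tm_headers, timeout=5)
print(resp.status_code)  # expect 200 when the headers are accepted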
 
def req(url, headers):
    # Fetch a URL and return a parsed BeautifulSoup document, or None on failure.
    soup = None
    try:
        content = requests.get(url, headers=headers, timeout=2)
        if content.status_code == 200:
            soup = BeautifulSoup(content.text, "html.parser")
    except Exception as e:
        print("get url error, url: {0}, error: {1}".format(url, e))
    return soup
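
req() gives up after a single attempt, and get_deltail() below compensates by re-requesting in a loop. A small retry wrapper (a sketch, not part of the original post) could centralize that logic:

def req_with_retry(url, headers, tries=3, delay=1):
    # Hypothetical helper: retry a few times before giving up, since the
    # endpoints occasionally return empty or throttled responses.
    for _ in range(tries):
        soup = req(url, headers)
        if soup is not None:
            return soup
        time.sleep(delay)
    return None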

def get_phone_list():
    # Scrape the Tmall phone landing page and collect (name, detail-page URL) pairs.
    phone_list = []
    list_url = "https://shouji.tmall.com/?spm=a222t.8063993.a2226c3nav.5.7b8f4da0yjyxC3&acm=lb-zebra-155904-807029.1003.4.767290&scm=1003.4.lb-zebra-155904-807029.OTHER_14592967254716_767290#J_floor12"
    soup = req(list_url, tm_headers)
    if soup is None:
        return phone_list
    txt = soup.find_all("li", class_="focus-")
    for i in txt[:-5]:  # the trailing entries are not phone listings
        a = i.find("a")
        name = i.find("h3").get_text()
        href = a.get("href")
        if name != "":
            phone_list.append({"url": "https:" + href, "name": name})
    return phone_list
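
For reference, get_phone_list() returns one dict per phone; the id= parameter in each url is what create_deltail_url() extracts below. An illustrative sample (the id and name here are made up; real values depend on the live page):

phone_list = [
    {"url": "https://detail.tmall.com/item.htm?id=1234567890", "name": "Example Phone 64GB"},
]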

def create_deltail_url(url, page=1, itemid=None, sellerid=None):
    # Build the review-API URL. The API needs two ids, itemid and sellerid,
    # and sellerid can only be taken from the item's detail page.
    if itemid is None and sellerid is None:
        itemid = url.split("id=")[-1].split("&")[0]
        soup = req(url, tm_headers)
        txt = soup.find_all("meta")[-1].get("content")
        sellerid = txt.split("userid=")[-1].replace(";", "")
    comment_json_url = "https://rate.tmall.com/list_detail_rate.htm?itemId={0}&sellerId={1}&currentPage={2}".format(itemid, sellerid, page)
    return comment_json_url, itemid, sellerid

def get_deltail(db, comment_json_url, itemid, sellerid, name):
    # Call the review API and collect review rows; returns the last page
    # number so the caller knows how far to paginate.
    pagenum = None
    comment_data = req(comment_json_url, tm_headers)
    if comment_data is not None:
        # The endpoint sometimes returns a page without data; retry a few times.
        count = 1
        while "paginator" not in str(comment_data) and count < 5:
            comment_data = req(comment_json_url, tm_headers)
            count += 1
            time.sleep(1)
        try:
            # Strip the non-JSON prefix the endpoint prepends to the payload.
            comment_str = str(comment_data)[15:]
            comment_json = json.loads(comment_str)
        except Exception:
            return None
        rateList = comment_json["rateList"]
        for item in rateList:
            data = {}
            data["itemid"] = itemid
            data["usernick"] = item["displayUserNick"]
            data["comment_content"] = item["rateContent"]
            data["comment_date"] = item["rateDate"]
            data["sellerid"] = sellerid
            # insert into db here (the original post leaves this step out)
        pagenum = comment_json["paginator"]["lastPage"]
    return pagenum

if __name__ == "__main__":
    db = None  # placeholder: the original post never shows its database handle
    phone_list = get_phone_list()
    for phone_url in phone_list:
        name = phone_url["name"]
        url = phone_url["url"]
        print("start scraping phone: {0}, page: {1}".format(name, 1))
        comment_json_url, itemid, sellerid = create_deltail_url(url)
        pagenum = get_deltail(db, comment_json_url, itemid, sellerid, name)
        if pagenum is not None:
            page = 2
            while page <= pagenum:  # include the last page reported by the paginator
                print("start scraping phone: {0}, page: {1}".format(name, page))
                comment_json_url, itemid, sellerid = create_deltail_url(phone_url["url"], page, itemid, sellerid)
                get_deltail(db, comment_json_url, itemid, sellerid, name)
                page += 1
                time.sleep(2)

 


Original post: http://www.cnblogs.com/dockers/p/7767914.html
