码迷,mamicode.com
首页 > 编程语言 > 详细

python爬取京东小爱音响评论

时间:2020-04-15 00:44:12      阅读:89      评论:0      收藏:0      [点我收藏+]

标签:client   guid   local   mysq   alc   user   das   dex   cli   

import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from sqlalchemy import create_engine
from pandas.io.sql import to_sql as pd_sql
import pymysql
import random
import time


# Helper that stores a pandas DataFrame in MySQL.
def pandas_to_mysql(df_data, table_name, **kwargs):
    """Append a DataFrame to a MySQL table through a short-lived engine.

    Args:
        df_data: DataFrame whose rows are written (the index is not stored).
        table_name: Name of the destination table; rows are appended.
        **kwargs: Connection settings. Required keys: ``user``, ``password``,
            ``host``, ``port``, ``db`` and ``charset``.

    Raises:
        KeyError: If one of the required connection settings is missing.
    """
    # NOTE(review): the credentials are interpolated verbatim; a password with
    # URL-reserved characters (e.g. '@' or '/') would need escaping first.
    connection_url = 'mysql+pymysql://{}:{}@{}:{}/{}?charset={}'.format(
        kwargs['user'], kwargs['password'], kwargs['host'],
        kwargs['port'], kwargs['db'], kwargs['charset'])
    engine = create_engine(connection_url)
    try:
        # if_exists accepts 'replace' or 'append'; this script always appends.
        pd_sql(df_data, table_name, engine, index=False, if_exists='append', chunksize=10000)
    finally:
        # Release the connection pool even when the write fails.
        engine.dispose()


# Connection settings for the local MySQL database; unpacked as keyword
# arguments into pandas_to_mysql().
db_local_hwdata = {
    'user': 'root',
    'password': '8888',
    'port': 3306,
    'host': 'localhost',
    'db': 'hwdata',
    'charset': 'utf8mb4',
}

# Proxy pool.
# A dict literal keeps only the last value of a repeated key, so listing every
# proxy under 'http' silently reduced the pool to one entry. The candidates
# are kept in a tuple instead and one is chosen at random for this run.
PROXY_POOL = (
    'http://120.83.110.65:9999',
    'http://183.146.156.209:9999',
    'http://123.169.34.94:9999',
    'http://117.57.90.141:9999',
    'http://117.69.201.19:9999',
    'http://117.95.192.240:9999',  # was listed twice in the original pool
    'http://163.204.244.161:9999',
    'http://163.204.246.168:9999',
    'http://171.35.160.62:9999',
    'http://36.22.77.186:9999',
    'http://120.83.107.184:9999',
)

# NOTE(review): requests selects a proxy by URL scheme, so an 'http'-only
# mapping is presumably not applied to https:// requests - confirm whether an
# 'https' entry is wanted as well.
proxies = {'http': random.choice(PROXY_POOL)}

# Product comment endpoint (first page); fetched once to learn the total
# number of comments.
url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv33766&productId=5239477&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
headers = {
    'Referer': 'https://item.jd.com/5239477.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
}
# A timeout keeps the script from hanging forever on an unresponsive proxy.
result = requests.get(url, headers=headers, proxies=proxies, timeout=10)
result_text = result.text
# Drop the escaped newline sequences (backslash + 'n') embedded in the payload.
result_text = result_text.replace('\\n', '')

# Total number of comments; used for paging, 10 comments per page.
count_match = re.search(r'"commentCount":([\d]+),', result_text, re.S)
if count_match is None:
    raise ValueError('commentCount not found in the first comment page response')
comment_number = count_match.group(1)

# Walk through the comment pages (10 comments per page), parse every comment
# and append the page's rows to MySQL.

# Compiled once: one match per comment, 19 capture groups in column order.
comment_pattern = re.compile('"guid":"(.*?)",.*?"content":"(.*?)",.*?"creationTime":"(.*?)".*?"isDelete":(.*?),.*?"isTop":(.*?),.*?"replyCount":(.*?),.*?"score":(.*?),.*?"imageStatus":(.*?),.*?"usefulVoteCount":(.*?),.*?"userClient":(.*?),.*?"imageCount":(.*?),.*?"anonymousFlag":(.*?),.*?"plusAvailable":(.*?),.*?"productColor":"(.*?)".*?"imageIntegral".*?,(.*?)"status".*?,"referenceTime":"(.*?)".*?nickname":"(.*?)".*?"days":(.*?),.*?"afterDays":(.*?)}', re.S)
after_comment_pattern = re.compile('"content":"(.*?)",', re.S)
comment_columns = ['commentUrl', 'comment', 'commentTime', 'isDelete', 'isTop', 'replyCount', 'stars', 'imageFlag', 'voteCount', 'userClient', 'imageCount', 'anonymousFlag', 'plusFlag', 'productCatergory', 'afterComment', 'purchaseTime', 'userName', 'commentAfterBuyingTime', 'afterCommentAfterBuyingTime']

# Ceiling division: flooring skipped the final partial page and fetched
# nothing at all when there were fewer than 10 comments.
page_count = -(-int(comment_number) // 10)

for page_i in range(page_count):
    # Random pause between requests.
    time.sleep(random.uniform(0.6, 4))
    url = f'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv33766&productId=5239477&score=0&sortType=5&page={page_i}&pageSize=10&isShadowSku=0&fold=1'
    result = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    result_text = result.text
    result_text = result_text.replace('\\n', '')

    analyse_result = comment_pattern.findall(result_text)
    final_data = []

    for rt in analyse_result:
        person_data = []
        for ch_i, character in enumerate(rt):
            # Group 14 is the raw follow-up comment fragment; keep only its text.
            if ch_i == 14 and (character != ''):
                try:
                    character = after_comment_pattern.search(character).group(1)
                except AttributeError:
                    # search() returned None: no follow-up content in the fragment.
                    character = ''
            # Group 18 ('afterDays') is blanked when it is smaller than 'days'.
            if ch_i == 18 and (int(character) < int(rt[-2])):
                character = ''
            person_data.append(character)
        final_data.append(person_data)
    # Convert the page's rows to a DataFrame.
    dfanalyse_result = pd.DataFrame(final_data, columns=comment_columns)
    # Store the page in MySQL.
    pandas_to_mysql(dfanalyse_result, 'w20191212jingdong_comments', **db_local_hwdata)
    print(f'page_i{page_i} finished!')

 

python爬取京东小爱音响评论

标签:client   guid   local   mysq   alc   user   das   dex   cli   

原文地址:https://www.cnblogs.com/jaysonteng/p/12702295.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!