码迷,mamicode.com
首页 > 其他好文 > 详细

亚马逊美国Lightning_Deals爬虫

时间:2018-06-12 12:54:53      阅读:797      评论:0      收藏:0      [点我收藏+]

标签:attr   cep   lstat   max-age   pes   ice   error   页面   cache   

包含秒杀进度、距离结束时间、当前时间、商品标题、翻译后的标题、品牌、品牌是否有相关的备案注册信息、ASIN、Date first listed on Amazon、star、review、rank

删除了较多注释, 复制后能不能用随缘(注意:代码中的引号被页面转成了弯引号,直接复制无法运行,需先替换为英文单引号)

import csv
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from googletrans import Translator


# import requests.packages.urllib3.util.ssl_
# requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = ‘ALL‘

def trademark(goods_brand):
    """Check whether a brand has a USPTO trademark registration.

    Looks the brand up in a local Redis cache first; on a cache miss (or a
    previously cached "unregistered" verdict, which is re-checked) it queries
    the USPTO TESS search site with headless Chrome and caches the result.

    Args:
        goods_brand: Brand name string, or None / 'null' when unknown.

    Returns:
        'Registered', 'unregistered', or '未知' (unknown) when no usable
        brand name is available.
    """
    if goods_brand is None or goods_brand == 'null':
        # No usable brand name -> "unknown".
        return '未知'
    # NOTE(review): password is a redacted placeholder (XXXXX) -- fill in the
    # real credential before running.
    r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True, password=XXXXX)
    redis_brand = r.get(goods_brand)
    if redis_brand == 'Registered':
        return redis_brand
    if redis_brand is None or redis_brand == "unregistered":
        ff_option = Options()
        ff_option.add_argument('-headless')
        browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', options=ff_option)
        try:
            browser.get('http://tmsearch.uspto.gov')
            browser.find_element_by_xpath('/html/body/center/table[1]/tbody/tr[2]/td/font/font/a').click()
            # Type the brand to query into the search box.
            browser.find_element_by_name("p_s_PARA2").send_keys(goods_brand)
            # Click the search (submit) button.
            browser.find_element_by_xpath("//input[@onclick='changeCurlyQuote();']").click()
            # Heuristic: the TESS error page ("TESS -- Error", i.e. no match)
            # is very short, so a page longer than 920 chars is treated as a
            # registered hit.
            register_html = browser.page_source
            if len(register_html) > 920:
                brand_register = 'Registered'
            else:
                brand_register = 'unregistered'
        finally:
            # Always release the Chrome instance, even if the lookup raised.
            browser.quit()
        r.set(goods_brand, brand_register)
        return brand_register
    # Any other cached value is returned as-is instead of silently
    # returning None (the original fell off the end here).
    return redis_brand



def send_request(url, headers, proxies, session):
    """GET *url* through *session*, retrying forever until a response arrives.

    Args:
        url: Target URL.
        headers: HTTP header mapping.
        proxies: Proxy mapping passed through to requests.
        session: A requests.Session (or compatible) object.

    Returns:
        The response object from ``session.get()``.
    """
    # The original loop carried a dead `flag` variable that was never set
    # (the line after `continue` was unreachable); the intent was simply
    # "retry until success".
    while True:
        try:
            # NOTE(review): verify=False disables TLS certificate checking --
            # confirm this is intentional (likely needed for the proxy).
            return session.get(url, headers=headers, proxies=proxies, verify=False)
        except Exception as exc:
            print(exc)
            print('失败,正在重新尝试。')


def rank(goods_soup):
    """Extract the Best Sellers Rank text from a product detail page soup.

    Handles both the legacy ``<li id="SalesRank">`` layout and the newer
    productDetails table layout.

    Returns:
        The cleaned rank text (each '#N in ...' entry on its own line),
        or None when no rank element is present.
    """
    # Inline CSS that Amazon embeds inside the rank element; stripped below.
    inline_css = (".zg_hrsr { margin: 0; padding: 0; list-style-type: none; }"
                  ".zg_hrsr_item { margin: 0 0 0 10px; }"
                  ".zg_hrsr_rank { display: inline-block; width: 80px; text-align: right; }")

    def _clean(text):
        # Strip labels/CSS and put each '#N in ...' entry on its own line.
        return (text.replace("\n", '')
                    .replace(inline_css, "")
                    .replace("Amazon Best Sellers Rank:", '')
                    .replace("Amazon Bestsellers Rank: ", "")
                    .replace("Best Sellers Rank", "")
                    .strip()
                    .replace("#", '\n#'))

    goods_rank_li = goods_soup.find('li', id='SalesRank')
    if goods_rank_li:
        return _clean(goods_rank_li.text.strip())
    goods_rank_table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if goods_rank_table:
        for tr in goods_rank_table.find_all('tr'):
            if tr.find('th').text.strip().replace(" ", '') == 'BestSellersRank':
                return _clean(tr.find('td').text)
    return None


def title(goods_soup):
    """Return the product title, or 'null' when the page has none.

    Strips surrounding whitespace and removes non-breaking spaces and
    commas (commas would corrupt the CSV row written later).
    """
    title_span = goods_soup.find('span', id='productTitle')
    if title_span is None:
        return 'null'
    return title_span.text.strip().replace("\xa0", '').replace(",", '')


def brand(goods_soup):
    """Return the brand name, or 'null' when neither brand element exists.

    Checks the byline link (id='bylineInfo') first, then the standalone
    brand link (id='brand'), mirroring the two page layouts Amazon serves.
    """
    brand_el = goods_soup.find('a', id='bylineInfo')
    if brand_el is None:
        # Fallback layout; the original used try/except AttributeError here,
        # which is equivalent to this explicit None check.
        brand_el = goods_soup.find('a', id='brand')
    if brand_el is None:
        return 'null'
    return brand_el.text.strip().replace("\xa0", '')


def star(goods_soup):
    """Return the star rating (e.g. '4.5'), or 'null' when absent.

    The rating lives in an <i> element inside the acrPopover span, with
    text like '4.5 out of 5 stars'.
    """
    popover = goods_soup.find('span', id='acrPopover')
    if popover is None:
        return 'null'
    return popover.find('i').text.split(" out of 5 stars")[0]


def review(goods_soup):
    """Return the customer review count as text, or 'null' when absent.

    The count comes from text like '1,234 customer reviews'.
    """
    review_span = goods_soup.find('span', id='acrCustomerReviewText')
    if review_span is None:
        return 'null'
    return review_span.text.split(" customer reviews")[0]


def price(goods_soup):
    """Return the product price without the '$' sign, or 'null' if not found.

    Checks the possible price element ids in the same priority order as the
    original nested if/else chain: deal price, buy-box price, sale price,
    then the regular price.
    """
    for span_id in ('priceblock_dealprice', 'newBuyBoxPrice',
                    'priceblock_saleprice', 'priceblock_ourprice'):
        price_span = goods_soup.find('span', id=span_id)
        if price_span is not None:
            return price_span.text.replace("$", '')
    return 'null'


def date(goods_soup):
    """Return the 'Date first listed on Amazon' value, or None if absent.

    Handles both the bullet-list layout (detailBullets_feature_div) and the
    productDetails table layout. When the bullet-list div exists, the table
    is not consulted (matching the original control flow).
    """
    bullets_div = goods_soup.find('div', id='detailBullets_feature_div')
    if bullets_div:
        for li in bullets_div.find_all('li'):
            label_span = li.find('span', class_='a-text-bold')
            if label_span and label_span.text.strip() == 'Date first listed on Amazon:':
                return li.text.strip().replace("Date first listed on Amazon:", '').strip()
        return None
    date_table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if date_table:
        for tr in date_table.find_all('tr'):
            if tr.find('th').text.strip().replace(" ", '') == 'DatefirstlistedonAmazon':
                return tr.find('td').text.replace("\n", '').strip()
    return None


def translator(title):
    """Translate *title* to Simplified Chinese via Google Translate (cn host).

    NOTE(review): src='de' (German) looks wrong for amazon.com (English)
    titles -- confirm the intended source language.

    Args:
        title: Product title text to translate.

    Returns:
        The translated text.
    """
    engine = Translator(service_urls=['translate.google.cn'])
    result = engine.translate('%s' % title, src='de', dest="zh-CN")
    # Use the public .text attribute instead of the original's brittle
    # parsing of the Translated object's repr string.
    return result.text


def run(page):
    """Scrape one Amazon US Lightning Deals page and append rows to a CSV.

    Loads the gold-box deals page with headless Chrome (the grid is rendered
    by JavaScript), then fetches each product's detail page with requests,
    extracts all fields, and appends one row per product to
    ``./Lightning_Deals_US/Lightning_Deals_US_<page>.csv``.

    Args:
        page: 1-based deals page number.
    """
    print("当前页码为:%s" % page)
    base_url = 'https://www.amazon.com/dp/'
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'upgrade-insecure-requests': '1',
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 'www.amazon.com',
    }

    ff_option = Options()
    ff_option.add_argument('-headless')
    # `options=` replaces the deprecated `chrome_options=` kwarg (and matches
    # the usage in trademark()).
    browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', options=ff_option)
    browser.get('https://www.amazon.com/gp/goldbox/ref=gbps_ftr_s-4_d724_page_' + str(
        page) + '?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL%252CEXPIRED%252CSOLDOUT%252CUPCOMING,page:' + str(page) + ',dealTypes:LIGHTNING_DEAL,dealsPerPage:48')
    time.sleep(10)  # give the JS-rendered deal grid time to finish loading
    page_source = browser.page_source
    # The original never quit the browser, leaking one Chrome process per
    # page; we only need the rendered HTML, so release it immediately.
    browser.quit()
    page_soup = BeautifulSoup(page_source, 'lxml')
    all_goods_div = page_soup.find('div', id='widgetContent').find_all('div', class_='a-section dealContainer')
    print(len(all_goods_div))
    for goods_div in all_goods_div:
        schedule_div = goods_div.find('div', 'a-column a-span5 a-text-left unitLineHeight')

        # Claimed-percentage progress text of the lightning deal.
        if schedule_div:
            schedule = schedule_div.find('div', 'a-row unitLineHeight').text.strip().replace(" Claimed", '').replace("\xa0", '').replace("\xae", '').replace("\u2122", '')
        else:
            schedule = 'null'

        # Time remaining until the deal ends.
        timer = goods_div.find('span', role='timer')
        if timer:
            end_time = timer.text.strip().replace("\xa0", '').replace("\xae", '').replace("\u2122", '')
        else:
            end_time = 'null'
        now_time = time.strftime('%H:%M:%S', time.localtime(time.time()))

        dealtitle = goods_div.find('a', id='dealImage')
        # Derive the ASIN from the deal link; skip deals without a /dp/ link.
        try:
            goods_asin = dealtitle['href'].split("dp/")[1].split('/')[0]
        except IndexError:
            continue
        goods_url = base_url + goods_asin
        goods_html = requests.get(goods_url, headers=headers)
        goods_soup = BeautifulSoup(goods_html.text, 'lxml')
        print("商品链接为:" + goods_url)
        goods_asin = goods_url.split('dp/')[1]
        goods_title = title(goods_soup)
        after_title = translator(goods_title)
        goods_brand = brand(goods_soup)
        goods_star = star(goods_soup)
        goods_review = review(goods_soup)
        goods_price = price(goods_soup)
        goods_rank = rank(goods_soup)
        goods_date = date(goods_soup)
        brand_register = trademark(goods_brand)
        if goods_date is None:
            goods_date = 'null'
        print("schedule:" + schedule)
        print("goods_title:" + goods_title)
        print("after_title:" + after_title)
        print("goods_asin:" + goods_asin)
        print("goods_brand:" + goods_brand)
        print("brand_register:" + brand_register)
        print("goods_date:" + str(goods_date))
        print("goods_star:" + goods_star)
        print("end_time:" + end_time)
        print("now_time:" + now_time)
        print("goods_review:" + goods_review)
        print("goods_rank:" + str(goods_rank))
        print("goods_price:" + goods_price)
        # One CSV row per product, in a fixed column order.
        goods_info_list = [
            schedule, end_time, now_time, goods_title, after_title,
            goods_brand, brand_register, goods_asin, goods_date,
            goods_star, goods_review, goods_rank, goods_price,
        ]
        print('=========================================')
        # newline='' prevents blank lines between rows on Windows; the
        # `with` block guarantees the file is closed even on error.
        with open('./Lightning_Deals_US/Lightning_Deals_US_%s.csv' % str(page), 'a', newline='', encoding='gb18030') as csv_file:
            csv.writer(csv_file).writerow(goods_info_list)


if __name__ == '__main__':
    # Scrape deals pages 1 through 21 inclusive.
    for i in range(1, 22):
        run(i)

技术分享图片

亚马逊美国Lightning_Deals爬虫

标签:attr   cep   lstat   max-age   pes   ice   error   页面   cache   

原文地址:https://www.cnblogs.com/qiushi9/p/9172414.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!