亚马逊美国Lightning_Deals爬虫

时间：2018-06-12 12:54:53 阅读：797 评论：0 收藏：0 [点我收藏+]
标签：attr cep lstat max-age pes ice error 页面 cache
包含秒杀进度、距离结束时间、当前时间、商品标题、翻译后的标题、品牌、品牌是否有相关的备案注册信息、ASIN、Date first listed on Amazon、star、review、rank、price
删除了较多注释，复制后能不能直接使用随缘。
import csv
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from googletrans import Translator


# import requests.packages.urllib3.util.ssl_
# requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = ‘ALL‘

def trademark(goods_brand):
    """Report whether a brand appears to be registered with the USPTO.

    The result is cached in a local Redis instance keyed by the brand name.
    A cached ``'Registered'`` is returned immediately; a missing or
    ``'unregistered'`` entry triggers a fresh lookup through a headless
    Chrome session against ``tmsearch.uspto.gov``.

    Args:
        goods_brand: Brand name as scraped from the product page, or
            ``None`` / ``'null'`` / ``''`` when no brand was found.

    Returns:
        ``'未知'`` when there is no brand to look up, otherwise
        ``'Registered'`` or ``'unregistered'``.

    NOTE(review): this relies on the legacy TESS search UI and on the
    Selenium 3 ``find_element_by_*`` helpers -- confirm both are still
    available in the deployment environment.
    """
    # Nothing to search for: missing brand or the 'null' placeholder.
    if not goods_brand or goods_brand == "null":
        return "未知"

    import os  # function-scope import: only needed for the Redis password

    # The published source had a redacted, undefined placeholder here.  The
    # password is read from the environment instead; None means "no auth".
    r = redis.Redis(
        host="127.0.0.1",
        port=6379,
        db=0,
        decode_responses=True,
        password=os.environ.get("REDIS_PASSWORD"),
    )
    redis_brand = r.get(goods_brand)
    if redis_brand == "Registered":
        return redis_brand

    # Cache miss or previously 'unregistered': query the USPTO site again.
    ff_option = Options()
    ff_option.add_argument("-headless")
    browser = webdriver.Chrome(
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        options=ff_option,
    )
    try:
        browser.get("http://tmsearch.uspto.gov")
        browser.find_element_by_xpath("/html/body/center/table[1]/tbody/tr[2]/td/font/font/a").click()
        # Type the brand into the search box.
        browser.find_element_by_name("p_s_PARA2").send_keys(goods_brand)
        # Press the submit button.
        browser.find_element_by_xpath("//input[@onclick='changeCurlyQuote();']").click()
        register_html = browser.page_source
    finally:
        # Always release the browser, even when a lookup step fails.
        browser.quit()

    # Heuristic kept from the original: a result page longer than 920
    # characters counts as a hit; shorter pages are treated as "not found".
    if len(register_html) > 920:
        brand_register = "Registered"
    else:
        brand_register = "unregistered"
    r.set(goods_brand, brand_register)
    return brand_register



def send_request(url, headers, proxies, session, max_retries=None, retry_delay=1.0):
    """GET ``url`` through ``session``, retrying when the request raises.

    Args:
        url: Address to fetch.
        headers: Request headers passed to ``session.get``.
        proxies: Proxy mapping passed to ``session.get`` (may be ``None``).
        session: Object exposing a requests-style ``get`` method.
        max_retries: Number of retries allowed after the first failure.
            ``None`` (the default) retries forever, as the original did.
        retry_delay: Seconds to wait between attempts; ``0`` disables it.

    Returns:
        Whatever ``session.get`` returns on the first successful attempt.

    Raises:
        Exception: The last error from ``session.get`` once ``max_retries``
            is exhausted.
    """
    failures = 0
    while True:
        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept for compatibility -- confirm this is really wanted.
            return session.get(url, headers=headers, proxies=proxies, verify=False)
        except Exception as E:  # deliberately broad: best-effort retry loop
            failures += 1
            print(E)
            if max_retries is not None and failures > max_retries:
                raise
            print("失败，正在重新尝试。")
            if retry_delay:
                time.sleep(retry_delay)


def _clean_rank_text(raw_text):
    """Strip labels and inline CSS from a rank cell, one rank per line."""
    inline_css = (
        ".zg_hrsr { margin: 0; padding: 0; list-style-type: none; }.zg_hrsr_item { margin: 0 0 0 10px; }.zg_hrsr_rank { display: inline-block; width: 80px; text-align: right; }"
    )
    cleaned = raw_text.strip().replace("\n", "")
    # Order matters: the longer labels must go before "Best Sellers Rank".
    for noise in (
        inline_css,
        "Amazon Best Sellers Rank:",
        "Amazon Bestsellers Rank: ",
        "Best Sellers Rank",
    ):
        cleaned = cleaned.replace(noise, "")
    # Put every "#N in Category" entry on its own line.
    return cleaned.strip().replace("#", "\n#")


def rank(goods_soup):
    """Extract the best-seller rank text from a product page.

    Looks first for ``<li id="SalesRank">`` and then for the
    "Best Sellers Rank" row of the
    ``productDetails_detailBullets_sections1`` table.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The cleaned rank text with each ``#`` entry preceded by a newline,
        or ``None`` when neither layout contains a rank.
    """
    goods_rank_li = goods_soup.find("li", id="SalesRank")
    if goods_rank_li is not None:
        return _clean_rank_text(goods_rank_li.text)

    goods_rank_table = goods_soup.find("table", id="productDetails_detailBullets_sections1")
    if goods_rank_table is None:
        return None

    for tr in goods_rank_table.find_all("tr"):
        header_cell = tr.find("th")
        value_cell = tr.find("td")
        # Skip rows without both cells instead of raising AttributeError.
        if header_cell is None or value_cell is None:
            continue
        if header_cell.text.strip().replace(" ", "") == "BestSellersRank":
            return _clean_rank_text(value_cell.text)
    return None


def title(goods_soup):
    """Return the product title from ``<span id="productTitle">``.

    Non-breaking spaces and commas are removed from the title text.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The cleaned title, or the string ``'null'`` when the span is absent.
    """
    goods_title_span = goods_soup.find("span", id="productTitle")
    if goods_title_span is None:
        return "null"
    return goods_title_span.text.strip().replace("\xa0", "").replace(",", "")


def brand(goods_soup):
    """Return the brand name shown on a product page.

    Tries ``<a id="bylineInfo">`` first and falls back to ``<a id="brand">``.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The brand text with surrounding whitespace and non-breaking spaces
        removed, or the string ``'null'`` when neither anchor exists.
    """
    for anchor_id in ("bylineInfo", "brand"):
        goods_brand_a = goods_soup.find("a", id=anchor_id)
        if goods_brand_a is not None:
            return goods_brand_a.text.strip().replace("\xa0", "")
    return "null"


def star(goods_soup):
    """Return the star rating shown inside ``<span id="acrPopover">``.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The text preceding ``" out of 5 stars"`` in the popover's ``<i>``
        element, or the string ``'null'`` when the popover or its ``<i>``
        child is missing.
    """
    goods_star_span = goods_soup.find("span", id="acrPopover")
    if goods_star_span is None:
        return "null"
    goods_star_icon = goods_star_span.find("i")
    # A popover without the icon element used to raise AttributeError.
    if goods_star_icon is None:
        return "null"
    return goods_star_icon.text.split(" out of 5 stars")[0]


def review(goods_soup):
    """Return the review count from ``<span id="acrCustomerReviewText">``.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The text preceding ``" customer review"`` (so both
        "1 customer review" and "1,234 customer reviews" yield only the
        number), or the string ``'null'`` when the span is absent.
    """
    goods_review_span = goods_soup.find("span", id="acrCustomerReviewText")
    if goods_review_span is None:
        return "null"
    # Splitting on the singular form also covers the plural suffix.
    return goods_review_span.text.split(" customer review")[0]


def price(goods_soup):
    """Return the product price with the ``$`` sign removed.

    The price spans are tried in priority order: deal price, buy-box price,
    sale price, then the regular price.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The text of the first span found, without ``$``, or the string
        ``'null'`` when none of the spans exists.
    """
    price_span_ids = (
        "priceblock_dealprice",
        "newBuyBoxPrice",
        "priceblock_saleprice",
        "priceblock_ourprice",
    )
    for span_id in price_span_ids:
        goods_price_span = goods_soup.find("span", id=span_id)
        if goods_price_span is not None:
            return goods_price_span.text.replace("$", "")
    return "null"


def date(goods_soup):
    """Return the "Date first listed on Amazon" value of a product page.

    The bullet list in ``<div id="detailBullets_feature_div">`` is searched
    first. If it is absent or has no such entry, the
    ``productDetails_detailBullets_sections1`` table is searched.

    Args:
        goods_soup: Parsed product page exposing ``find``.

    Returns:
        The date text as shown on the page, or ``None`` when neither layout
        contains it.
    """
    label = "Date first listed on Amazon:"

    goods_date_div = goods_soup.find("div", id="detailBullets_feature_div")
    if goods_date_div is not None:
        for li in goods_date_div.find_all("li"):
            li_title_span = li.find("span", class_="a-text-bold")
            if li_title_span is None:
                continue
            if li_title_span.text.strip() == label:
                return li.text.strip().replace(label, "").strip()

    # Previously the table was only consulted when the bullet div was
    # missing, so a div without a date entry hid a date present in the table.
    goods_date_table = goods_soup.find("table", id="productDetails_detailBullets_sections1")
    if goods_date_table is not None:
        for tr in goods_date_table.find_all("tr"):
            header_cell = tr.find("th")
            value_cell = tr.find("td")
            # Skip rows without both cells instead of raising AttributeError.
            if header_cell is None or value_cell is None:
                continue
            if header_cell.text.strip().replace(" ", "") == "DatefirstlistedonAmazon":
                return value_cell.text.replace("\n", "").strip()
    return None


def translator(title):
    """Translate a product title into Simplified Chinese via googletrans.

    Args:
        title: Text to translate; it is converted with ``str()`` first.

    Returns:
        The translated text.

    NOTE(review): the source language was hard-coded to German (``'de'``)
    although this crawler targets the US marketplace, so it is now
    auto-detected. The ``translate.google.cn`` endpoint is kept from the
    original -- confirm it is still reachable.
    """
    client = Translator(service_urls=["translate.google.cn"])
    result = client.translate(str(title), src="auto", dest="zh-CN")
    # Use the documented attribute rather than slicing the object's repr,
    # which broke whenever a pronunciation was present.
    return result.text


def _strip_deal_symbols(text):
    """Remove non-breaking spaces and the (R) / (TM) signs from ``text``."""
    return text.replace("\xa0", "").replace("\xae", "").replace("\u2122", "")


def _parse_deal_card(goods_div):
    """Read progress, countdown and ASIN from one deal card.

    Returns a ``(schedule, end_time, goods_asin)`` tuple. The first two fall
    back to ``'null'``; ``goods_asin`` is ``None`` when the card has no
    usable ``dp/`` link.
    """
    schedule = "null"
    schedule_div = goods_div.find("div", "a-column a-span5 a-text-left unitLineHeight")
    if schedule_div:
        progress_div = schedule_div.find("div", "a-row unitLineHeight")
        if progress_div:
            schedule = _strip_deal_symbols(progress_div.text.strip().replace(" Claimed", ""))

    end_time = "null"
    timer = goods_div.find("span", role="timer")
    if timer:
        end_time = _strip_deal_symbols(timer.text.strip())

    goods_asin = None
    dealtitle = goods_div.find("a", id="dealImage")
    href = dealtitle.get("href") if dealtitle else None
    if href and "dp/" in href:
        # Keep only the path segment after "dp/", without any query string.
        goods_asin = href.split("dp/")[1].split("/")[0].split("?")[0]
    return schedule, end_time, goods_asin


def run(page):
    """Crawl one Lightning Deals listing page and append its deals to a CSV.

    The listing is rendered in headless Chrome. Each deal's product page is
    then fetched with ``requests`` and parsed by the sibling helpers
    (``title``, ``translator``, ``brand``, ``star``, ``review``, ``price``,
    ``rank``, ``date``, ``trademark``). One row per deal is appended to
    ``./Lightning_Deals_US/Lightning_Deals_US_<page>.csv`` (gb18030) with
    these columns: progress, time left, current time, title, translated
    title, brand, trademark status, ASIN, first-listed date, stars, reviews,
    rank, price.

    Args:
        page: 1-based listing page number.
    """
    import os  # function-scope import: only needed to create the output dir

    print("当前页码为：%s" % page)
    base_url = "https://www.amazon.com/dp/"
    # No cookie or accept-language header is sent with product requests.
    headers = {
        "accept-encoding": "gzip, deflate, br",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "cache-control": "max-age=0",
        "authority": "www.amazon.com",
    }

    ff_option = Options()
    ff_option.add_argument("-headless")
    browser = webdriver.Chrome(
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        options=ff_option,
    )
    try:
        browser.get("https://www.amazon.com/gp/goldbox/ref=gbps_ftr_s-4_d724_page_" + str(
            page) + "?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL%252CEXPIRED%252CSOLDOUT%252CUPCOMING,page:" + str(page) + ",dealTypes:LIGHTNING_DEAL,dealsPerPage:48")
        # Fixed wait for the deal grid to be rendered by JavaScript.
        time.sleep(10)
        pageSource = browser.page_source
    finally:
        # The original never closed the browser, leaking one Chrome per page.
        browser.quit()

    page_soup = BeautifulSoup(pageSource, "lxml")
    widget_div = page_soup.find("div", id="widgetContent")
    if widget_div:
        all_goods_div = widget_div.find_all("div", class_="a-section dealContainer")
    else:
        all_goods_div = []
    print(len(all_goods_div))
    if not all_goods_div:
        return

    output_dir = "./Lightning_Deals_US"
    os.makedirs(output_dir, exist_ok=True)
    csv_path = "%s/Lightning_Deals_US_%s.csv" % (output_dir, str(page))
    # newline="" prevents blank lines between rows on Windows.
    with open(csv_path, "a", newline="", encoding="gb18030") as csvFile:
        writer = csv.writer(csvFile)
        for goods_div in all_goods_div:
            schedule, end_time, goods_asin = _parse_deal_card(goods_div)
            if not goods_asin:
                continue
            now_time = time.strftime("%H:%M:%S", time.localtime(time.time()))

            goods_url = base_url + goods_asin
            try:
                goods_html = requests.get(goods_url, headers=headers, timeout=30)
            except requests.RequestException as error:
                # Skip this deal rather than aborting the whole page.
                print(error)
                continue
            goods_soup = BeautifulSoup(goods_html.text, "lxml")
            print("商品链接为：" + goods_url)

            goods_title = title(goods_soup)
            after_title = translator(goods_title)
            goods_brand = brand(goods_soup)
            goods_star = star(goods_soup)
            goods_review = review(goods_soup)
            goods_price = price(goods_soup)
            goods_rank = rank(goods_soup)
            goods_date = date(goods_soup)
            brand_register = trademark(goods_brand)
            if goods_date is None:
                goods_date = "null"

            # Console log, in the original print order.
            for label, value in (
                ("schedule", schedule),
                ("goods_title", goods_title),
                ("after_title", after_title),
                ("goods_asin", goods_asin),
                ("goods_brand", goods_brand),
                ("brand_register", brand_register),
                ("goods_date", goods_date),
                ("goods_star", goods_star),
                ("end_time", end_time),
                ("now_time", now_time),
                ("goods_review", goods_review),
                ("goods_rank", goods_rank),
                ("goods_price", goods_price),
            ):
                print(label + ":" + str(value))

            goods_info_list = [
                schedule,
                end_time,
                now_time,
                goods_title,
                after_title,
                goods_brand,
                brand_register,
                goods_asin,
                goods_date,
                goods_star,
                goods_review,
                goods_rank,
                goods_price,
            ]
            print("=========================================")
            writer.writerow(goods_info_list)
            # Flush per row so finished rows survive a later crash.
            csvFile.flush()


if __name__ == "__main__":
    # Crawl listing pages 1 through 21, one CSV file per page.
    for i in range(1, 22):
        run(i)
技术分享图片
亚马逊美国Lightning_Deals爬虫
标签：attr cep lstat max-age pes ice error 页面 cache
原文地址：https://www.cnblogs.com/qiushi9/p/9172414.html
踩
(0)
评论一句话评论（0）
分享档案
更多>
2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)
周排行