# coding:utf-8
"""Selenium-driven crawler for Taobao search results.

Searches taobao.com for "python", pages through the results and prints a
few fields (image URL, title, location, shop, price, number of buyers)
for every listing whose markup matches the patterns below.

NOTE: this module targets Python 2 (``reload(sys)``, ``xrange``).
"""
import datetime
import json
import logging.handlers
import pickle
import platform
import re
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup

session = requests.session()

# Python 2 only: make utf-8 the implicit str/unicode codec so that
# ``str(page_source)`` below does not fail on non-ASCII characters.
reload(sys)
sys.setdefaultencoding('utf8')

# NOTE(security): host and password are hard-coded in source; they should be
# moved to configuration or environment variables.
r = redis.Redis(host="123.56.74.190", port=6379, password="ZBHRwlb1608")

sysStr = platform.system()
if sysStr == "Windows":
    LOG_FILE_check = 'C:\\log\\wlb\\crawler\\cic.log'
else:
    LOG_FILE_check = '/log/wlb/crawler/cic.log'

# Rotate the log at 128 MiB and keep at most ten backup files.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE_check, maxBytes=128 * 1024 * 1024, backupCount=10)
fmt = '\n' + '%(asctime)s - %(filename)s:%(lineno)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)

logger = logging.getLogger('check')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

# Patterns are applied to the page source AFTER all newlines, carriage
# returns, tabs and spaces have been stripped, hence tokens like "<divclass=".
_ITEM_RE = re.compile('<divclass="pic-boxJ_MouseEneterLeaveJ_PicBox">(.*?)</div><divclass="ctx-boxJ_MouseEneterLeaveJ_IconMoreNew">(.*?)</div><divclass="rowrow-4g-clearfix">(.*?)</div></div></div>')
_IMG_URL_RE = re.compile('data-src="(.*?)"alt=')
_NAME_RE = re.compile('alt="(.*?)"/></a></div><divclass=')
_LOCATION_RE = re.compile('<divclass="location">(.*?)</div>')
_COMPANY_RE = re.compile('</span></span><span>(.*?)</span></a></div><divclass="location">')
_PRICE_RE = re.compile('<divclass="priceg_priceg_price-highlight"><span>¥</span><strong>(.*?)</strong></div>')
_PURCHASE_RE = re.compile('<divclass="deal-cnt">(.*?)人付款</div>')


def dateRange(start, end, step=1, format="%Y-%m-%d"):
    """Return the dates from ``start`` up to, but not including, ``end``.

    :param start: first date, formatted according to ``format``.
    :param end: exclusive upper bound, formatted according to ``format``.
    :param step: number of days between consecutive entries.
    :param format: ``strptime``/``strftime`` pattern used for parsing the
        inputs and formatting the results.
    :return: list of date strings.
    """
    strptime, strftime = datetime.datetime.strptime, datetime.datetime.strftime
    first_day = strptime(start, format)
    days = (strptime(end, format) - first_day).days
    return [strftime(first_day + datetime.timedelta(i), format)
            for i in xrange(0, days, step)]


def _first_match(pattern, text, default=''):
    """Return the first capture of ``pattern`` in ``text``, or ``default``."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def _extract_items(page_source):
    """Parse one results page into a list of per-listing field tuples.

    Each tuple is ``(image_url, name, location, company, price,
    purchase_count)``; a field whose pattern does not match is ``''``
    instead of aborting the whole page.
    """
    compact = str(page_source).replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
    items = []
    for block in _ITEM_RE.findall(compact):
        picture_html, detail_html = block[0], block[1]
        items.append((
            _first_match(_IMG_URL_RE, picture_html),
            _first_match(_NAME_RE, picture_html),
            _first_match(_LOCATION_RE, detail_html),
            _first_match(_COMPANY_RE, detail_html),
            _first_match(_PRICE_RE, detail_html),
            _first_match(_PURCHASE_RE, detail_html),
        ))
    return items


def spider():
    """Search Taobao for "python" and print the listings of each page.

    Opens Chrome through chromedriver, submits the search, then 99 times
    clicks the "next page" link, waits for the page to load and prints the
    parsed listings. The browser is always closed on exit, including when
    an element lookup raises.
    """
    from selenium import webdriver
    import os

    # Location of chromedriver.exe (raw string: the backslashes are literal).
    chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    os.environ["webdriver.chrome.driver"] = chromedriver
    browser = webdriver.Chrome(chromedriver)
    try:
        url = "https://www.taobao.com/"
        browser.get(url)
        time.sleep(1)
        browser.find_element_by_id("q").send_keys(u'python')
        browser.find_element_by_class_name("btn-search").click()
        time.sleep(5)
        for _page in range(1, 100):
            # NOTE(review): the click happens before scraping, so the first
            # results page is never parsed - confirm this is intended.
            browser.find_element_by_xpath('//a[@trace="srp_bottom_pagedown"]').click()
            time.sleep(15)
            items = _extract_items(browser.page_source)
            print(len(items))
            for fields in items:
                for value in fields:
                    print(value)
                print("=" * 30)
    finally:
        # Release the Chrome / chromedriver processes.
        browser.quit()


spider()