码迷,mamicode.com
首页 > 其他好文 > 详细

有ip 就横行霸道

时间:2017-08-24 22:35:59      阅读:283      评论:0      收藏:0      [点我收藏+]

标签:gecko   key   int   span   option   uil   driver   append   ftime   

 

 

 

技术分享

 

 

 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
import urllib.parse
import xlrd
import sys
import os

s = os.sep
# Directory holding pages saved by earlier runs (hard-coded Windows path).
filepath = 'D:\\pymine\\clean\\spider_map\\baidu_map_html_firstpage_pc'
# Query strings recovered from the saved file names; thread_city_district()
# skips any query that is already in this list.
requested_file_list = []
pathDir = os.listdir(filepath)
for allDir in pathDir:
    # The directory path and the entry name are glued together without a
    # separator, so splitting on the directory name yields the entry name.
    child = os.path.join('%s%s' % (filepath, allDir))
    # Keep only the part before the first '&' and before '.html'.
    requested_file = child.split('baidu_map_html_firstpage_pc')[1].split('&')[0].split('.html')[0]
    requested_file_list.append(requested_file)

# Separators at which extract_name() cuts a name.
# NOTE(review): the first separator was lost when this listing was extracted
# (an empty string is impossible here because str.split('') raises
# ValueError); the full-width parenthesis is a presumption -- confirm.
tag_jmtool_list = ['（', '(', '-']

# User-agent strings: every line of mobile_ua.txt that contains 'Mozilla',
# with surrounding whitespace removed.
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    ua_list = [line.strip() for line in uafile if 'Mozilla' in line]

# Highest valid index into ua_list (-1 when the list is empty).
ua_list_len_ = len(ua_list) - 1


def extract_name(name_):
    """Return *name_* cut off at the first occurrence of any separator.

    The separators come from the module-level ``tag_jmtool_list`` and are
    applied one after another, each time keeping only the text in front of
    the separator.
    """
    shortened = name_
    for separator in tag_jmtool_list:
        head, *_ = shortened.split(separator)
        shortened = head
    return shortened


# Place types to crawl. Earlier assignments (residential compound + office
# building, then specialist hospital) were overwritten before use; only the
# last one, shopping mall, was ever effective.
target_type_list = ['商场']
requested_type_counter = 0
# Figures noted by the author per type (presumably row counts -- confirm):
# shopping mall 4705, hotel 24915, specialist hospital 2513,
# business district 334.
# Nested index: city -> district -> type -> reduced name ->
# {'name_reduction_list': [...], 'history_list': [...]}.
target_dic = {}
# Cities to crawl; a wider list was overwritten before use, so only the last
# assignment is kept.
target_city_list = ['深圳市']
file_name = 'JMTool任务_csv_py_wholeCSV'
FEXCEL = '%s%s' % (file_name, '.xlsx')
data = xlrd.open_workbook(FEXCEL)
table = data.sheets()[0]
nrows, ncols = table.nrows, table.ncols
res_dic, counter_ = {}, 0
for i in range(0, nrows):
    l = table.row_values(i)
    # Every row must have exactly these ten columns, otherwise the unpacking
    # raises ValueError.
    dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, emp_, emp_1 = l
    if city not in target_city_list:
        continue
    type_ = ref_area_type_code
    if type_ not in target_type_list:
        continue
    name_reduction = extract_name(name_)
    # A reduced name shorter than three characters is discarded in favour of
    # the full name.
    if len(name_reduction) < 3:
        name_reduction = name_
    by_name = target_dic.setdefault(city, {}).setdefault(district, {}).setdefault(type_, {})
    # The membership test has to look at the per-type level. Testing the
    # district level (whose keys are types) made it true for every row, so
    # the lists were recreated each time and kept only the latest row.
    if name_reduction not in by_name:
        by_name[name_reduction] = {'name_reduction_list': [], 'history_list': []}
    by_name[name_reduction]['name_reduction_list'].append(name_)
    by_name[name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html_firstpage_pc/'):
    """Save the page currently shown by *browser* to ``<dir_><query>.html``.

    The query is taken from the percent-decoded current URL: the text after
    the first ``&wd=``, cut at the first ``/?``. The saved file starts with
    the raw current URL wrapped in an HTML comment, followed by the page
    source.

    Args:
        browser: object exposing ``current_url`` and ``page_source``.
        dir_: prefix prepended to the file name; no separator is added, so
            it should end with one.

    Returns:
        None. When the URL contains no ``&wd=`` a diagnostic line is printed
        and nothing is written.
    """
    close_alert(browser)
    current_url_ = urllib.parse.unquote(browser.current_url)
    try:
        input_ = current_url_.split('&wd=')[1].split('/?')[0]
    except IndexError:
        # No '&wd=' in the URL, so there is no query to name the file after.
        print('Exception-', __file__, sys._getframe().f_lineno, current_url_)
        return
    current_url_ = '%s%s%s' % ('<!--', browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    file_name = '%s%s%s' % (dir_, input_, '.html')
    # The context manager closes the file; a bare `fo.closed` only reads an
    # attribute and leaves the handle open.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
    print('OK-writed-', sys._getframe().f_lineno, '')


def gen_random_letter():
    """Return one random lowercase ASCII letter, 'a' through 'z'."""
    lowest, highest = ord("a"), ord("z")
    code_point = random.randint(lowest, highest)
    return chr(code_point)


def gen_random_num():
    """Return a random decimal digit as an int in the range 0-9.

    ``random.randint`` includes both end points, so the upper bound is 9.
    An upper bound of 10 could return the two-character value 10, which
    would lengthen the id assembled by gen_sougo_pid().
    """
    return random.randint(0, 9)


def gen_sougo_pid():
    """Return a random id string assembled from sixteen pieces.

    Pieces 1, 3, 4 and 15 (counting from 1) come from gen_random_letter();
    every other piece is the text form of gen_random_num().
    """
    letter_positions = (1, 3, 4, 15)
    return ''.join(
        gen_random_letter() if position in letter_positions else str(gen_random_num())
        for position in range(1, 17)
    )


def close_alert(browser, attitude='accept'):
    """Placeholder for dismissing a browser alert; currently does nothing.

    Earlier attempts (overriding ``window.alert`` through injected
    JavaScript, or switching to the alert and dismissing it) were disabled
    by the author, so the function returns immediately.

    Args:
        browser: the browser object; unused.
        attitude: 'accept' or 'dismiss'; unused while the handling is
            disabled.

    Returns:
        None.
    """
    return


# input_ = ‘深圳市南山区荟芳园‘
# browser = webdriver.Firefox()

# Chrome session created when the module is loaded; mobile_mobile_pages_html()
# reads this module-level name.
# NOTE(review): the worker threads started at the end of the file all go
# through this one instance -- confirm that sharing it across threads is
# intended.
browser = webdriver.Chrome()
def mobile_mobile_pages_html(input_):
    """Open the Baidu Map search for *input_* and save the resulting page.

    Navigation goes through the module-level ``browser``, which is left open
    afterwards. Variants that the author had disabled (a per-call Chrome
    with mobile emulation and a random user agent, and an entry through a
    Sogou search page) are not part of the active flow.

    Args:
        input_: search text appended to the URL as-is.
    """
    # '%26wd%3D' is the percent-encoded form of '&wd='; write_res_html()
    # looks for '&wd=' after decoding the current URL.
    url_ = '%s%s' % ('http://map.baidu.com/?s=s%26wd%3D', input_)
    browser.get(url_)
    # Fixed two-second pause before the page source is captured.
    sleep(2)

    write_res_html(browser)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` once when it runs.

    ``args`` is handed to ``func`` as a single positional argument; it is
    not unpacked.
    """

    def __init__(self, func, args, name):
        super().__init__()
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        callback, payload = self.func, self.args
        callback(payload)


def thread_city_district(city_district):
    """Fetch and save the map page for every name of one city/district.

    Args:
        city_district: ``'<city>_<district>'``. Only the first underscore
            separates the two parts, so a district name may itself contain
            underscores.

    For each name stored under ``target_dic[city][district]`` the query
    ``city + district + name`` is built. Queries already present in
    ``requested_file_list`` are counted in the global
    ``requested_type_counter`` and skipped; the rest are passed to
    mobile_mobile_pages_html().
    """
    global requested_type_counter
    city, district = city_district.split('_', 1)
    for names_by_reduction in target_dic[city][district].values():
        for entry in names_by_reduction.values():
            for name_ in entry['name_reduction_list']:
                input_ = '%s%s%s' % (city, district, name_)
                if input_ in requested_file_list:
                    requested_type_counter += 1
                    print('requested_type_counter=', requested_type_counter, input_)
                    continue
                mobile_mobile_pages_html(input_)


# One worker thread per (city, district) pair found in target_dic.
threads_list = []
for city in target_dic:
    for district in target_dic[city]:
        city_district = '%s_%s' % (city, district)
        # MyThread.run() passes its `args` value to the function as one
        # positional argument, so the plain string is handed over as-is.
        thread_instance = MyThread(thread_city_district, city_district, thread_city_district.__name__)
        threads_list.append(thread_instance)
for t in threads_list:
    # Assigning to `t.setDaemon` would only overwrite the method; the flag
    # itself is the `daemon` attribute.
    t.daemon = False
    t.start()
# Wait for every worker to finish before the script ends.
for t in threads_list:
    t.join()

 

有ip 就横行霸道

标签:gecko   key   int   span   option   uil   driver   append   ftime   

原文地址:http://www.cnblogs.com/yuanjiangw/p/7425355.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!