# Tags: thread command click dbid values bug print nts max
# Title: bug
import sys
import os
# Absolute directory of this script and its parent directory; the parent is
# appended to sys.path (presumably so modules one level up can be imported).
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.dirname(curPath)
sys.path.append(rootPath)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
import urllib.parse
import xlrd
import sys
import os
import sqlite3
# Wall-clock time at which this run started, and the run's time budget in
# seconds; chk_time() compares the elapsed time against MAX_TIME.
start_time = time.time()
MAX_TIME = 60
def py_stop_update_db():
    """Set this script's ``pystatus`` to 2 in the spider status database.

    The row is selected by ``pyname``, which is this file's base name up to
    the first ``.py``.  The database file ``py_bdspider_status.db`` is looked
    up in the script's own directory (``curPath``).

    NOTE(review): status 2 presumably means "stopped" (the launcher sets 1
    before starting a script) -- confirm against the table's conventions.
    """
    db = os.path.join(curPath, 'py_bdspider_status.db')
    pyname = os.path.basename(__file__).split('.py')[0]
    # Bind the name as a parameter: a double-quoted value inside the SQL text
    # can be interpreted by SQLite as a column identifier.
    sql_ = 'UPDATE pystatus_table SET pystatus = 2 WHERE pyname = ?'
    print(sql_, pyname)
    conn = sqlite3.connect(db)
    try:
        conn.execute(sql_, (pyname,))
        conn.commit()
    finally:
        # Always release the connection, even if the UPDATE fails.
        conn.close()
def chk_time():
    """Shut the crawl down once it has run for more than MAX_TIME seconds.

    When the budget is exceeded, the status database is updated via
    ``py_stop_update_db()``, the browser's cookies are cleared and the
    browser is closed.  Within the budget the function does nothing.

    NOTE(review): the function returns normally after quitting the browser,
    so the caller keeps running and will presumably fail on its next
    browser call -- confirm that this is the intended way to stop.
    """
    if time.time() - start_time <= MAX_TIME:
        return
    py_stop_update_db()
    browser.delete_all_cookies()
    browser.quit()
# Directory (next to this script) holding the HTML pages saved by earlier runs,
# e.g. 'D:\\pymine\\clean\\spider_map\\baidu_map_html_firstpage_pc_not_shop'.
save_dir = 'baidu_map_html_firstpage_pc_not_shop'
filepath = os.path.join(curPath, save_dir)
# Create the directory on a first run so that listing it cannot fail.
os.makedirs(filepath, exist_ok=True)

# Search terms that already have a saved page: each is the saved file's name
# cut at the first '&' and at the first '.html'.
requested_file_list = []
pathDir = os.listdir(filepath)
for allDir in pathDir:
    child = os.path.join(filepath, allDir)
    # Derive the term from the file name itself rather than by splitting the
    # full path on the directory name.
    requested_file = allDir.split('&')[0].split('.html')[0]
    requested_file_list.append(requested_file)

# Separators at which a place name is truncated by extract_name().
# NOTE(review): '(' appears twice; one entry was possibly a full-width
# parenthesis before this text was copied -- confirm against the original.
tag_jmtool_list = ['(', '(', '-']

# User-agent strings; the loader below is disabled, so the list stays empty.
ua_list = []
# with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
#     for i in uafile:
#         if i.find('Mozilla') > -1:
#             ua_list.append(i.replace('\n', '').strip())
# ua_list_len_ = len(ua_list) - 1
def extract_name(name_, separators=None):
    """Return *name_* cut off at the first occurrence of any separator.

    Args:
        name_: The full name string.
        separators: Iterable of separator strings.  When omitted, the
            module-level ``tag_jmtool_list`` is used.

    Returns:
        The part of *name_* that precedes the earliest separator, or
        *name_* unchanged when no separator occurs in it.
    """
    if separators is None:
        separators = tag_jmtool_list
    for sep in separators:
        # Keep only the text before this separator, then continue with the
        # already shortened name.
        name_ = name_.split(sep)[0]
    return name_
# Place types to crawl; rows of any other type are skipped.
# Selections used in earlier runs: ['住宅小区', '写字楼'], ['住宅小区'],
# ['专科医院'].
target_type_list = ['商场']

# Number of search terms skipped because a saved page already exists.
requested_type_counter = 0

# Figures noted by the original author (presumably row counts per type):
# '商场' 4705, '酒店' 24915, '专科医院' 2513, '商圈' 334.

# Nested index of crawl targets, filled while reading the spreadsheet:
# city -> district -> type -> reduced name -> lists.
target_dic = {}

# Cities to crawl; rows of any other city are skipped.
# Selections used in earlier runs: ['北京市', '上海市', '深圳市', '广州市'],
# ['深圳市', '广州市'], ['深圳市'], ['北京市', '上海市'].
target_city_list = ['北京市']
# Spreadsheet with the crawl tasks, located next to this script.
file_name = 'JMTool任务_csv_py_wholeCSV'
FEXCEL = os.path.join(curPath, file_name + '.xlsx')
# NOTE(review): recent xlrd releases read only .xls files -- confirm the
# installed xlrd version can still open this .xlsx workbook.
data = xlrd.open_workbook(FEXCEL)
table = data.sheets()[0]
nrows, ncols = table.nrows, table.ncols
res_dic, counter_ = {}, 0

# Build target_dic[city][district][type][reduced name] with two parallel
# lists: the full names and the complete spreadsheet rows.
for i in range(nrows):
    l = table.row_values(i)
    # Every row is expected to have exactly these ten columns.
    dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, emp_, emp_1 = l
    if city not in target_city_list:
        continue
    type_ = ref_area_type_code
    if type_ not in target_type_list:
        continue
    name_reduction = extract_name(name_)
    # A reduced name shorter than three characters is not used; the full
    # name serves as the key instead.
    if len(name_reduction) < 3:
        name_reduction = name_
    _names_for_type = (
        target_dic.setdefault(city, {})
        .setdefault(district, {})
        .setdefault(type_, {})
    )
    # Create the entry only when it is missing at the *type* level, so rows
    # sharing a reduced name accumulate instead of replacing each other.
    if name_reduction not in _names_for_type:
        _names_for_type[name_reduction] = {
            'name_reduction_list': [],
            'history_list': [],
        }
    _entry = _names_for_type[name_reduction]
    _entry['name_reduction_list'].append(name_)
    _entry['history_list'].append(l)
# Default output directory for saved pages (with a trailing path separator,
# because write_res_html() builds file names by plain concatenation).
write_res_html_dir = os.path.join(curPath, 'baidu_map_html_firstpage_pc_not_shop', '')


def write_res_html(browser, dir_=write_res_html_dir):
    """Save the browser's current page as ``<dir_><search term>.html``.

    The search term is taken from the URL-decoded current URL: the text
    after ``&wd=`` up to the first ``/?``.  The file starts with an HTML
    comment holding the raw current URL, followed by the page source.

    Args:
        browser: Object exposing ``current_url`` and ``page_source``.
        dir_: Output directory prefix, including its trailing separator.

    Returns:
        None.  When the URL contains no ``&wd=`` part, a diagnostic line is
        printed and nothing is written.
    """
    close_alert(browser)
    # Read the URL once so the file name and the embedded comment agree.
    raw_url = browser.current_url
    current_url_ = urllib.parse.unquote(raw_url)
    try:
        input_ = current_url_.split('&wd=')[1].split('/?')[0]
    except IndexError:
        print('Exception-', __file__, sys._getframe().f_lineno, current_url_)
        return
    page_source = '%s%s%s%s' % ('<!--', raw_url, '-->', browser.page_source)
    file_name = '%s%s%s' % (dir_, input_, '.html')
    # The context manager guarantees the file is flushed and closed.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
    print(os.path.basename(__file__), 'OK-writed-', sys._getframe().f_lineno, '')
def gen_random_letter():
    """Return one random lowercase ASCII letter ('a' through 'z')."""
    return chr(random.randint(ord('a'), ord('z')))
def gen_random_num():
    """Return a random integer between 0 and 10, both ends included.

    NOTE(review): because the upper bound is inclusive the result can be the
    two-digit value 10 -- confirm whether a single digit (0-9) was intended.
    """
    return random.randint(0, 10)
def gen_sougo_pid():
    """Build a random token from 16 slots of letters and numbers.

    Slots 1, 3, 4 and 15 (counting from 1) hold a random lowercase letter;
    every other slot holds the decimal text of ``gen_random_num()``.

    Returns:
        The concatenated token as a string.
    """
    letter_slots = (1, 3, 4, 15)
    pieces = []
    for slot in range(1, 17):
        if slot in letter_slots:
            pieces.append(gen_random_letter())
        else:
            pieces.append(str(gen_random_num()))
    return ''.join(pieces)
def close_alert(browser, attitude='accept'):
    """Placeholder for alert handling; currently does nothing.

    Both arguments are accepted for interface compatibility and ignored.
    Earlier, now disabled, variants overrode ``window.alert`` through
    ``browser.execute_script`` or dismissed/accepted the alert obtained
    from ``browser.switch_to``.

    Args:
        browser: The WebDriver instance (unused).
        attitude: 'accept' or 'dismiss' (unused).

    Returns:
        None.
    """
    return
# Example search term: input_ = '深圳市南山区荟芳园'
# Alternative without an explicit driver path: browser = webdriver.Firefox()

# Single shared Firefox session, driven by the geckodriver binary that sits
# next to this script.
executable_path_str = os.path.join(curPath, 'geckodriver.exe')
# NOTE(review): newer Selenium releases replace the ``executable_path``
# keyword with a Service object -- confirm against the installed version.
browser = webdriver.Firefox(executable_path=executable_path_str)
def mobile_mobile_pages_html(input_):
    """Open the Baidu Maps search for *input_* and save the resulting page.

    The time budget is checked first via ``chk_time()``.  The shared
    ``browser`` then loads the search URL, with a two-second pause before
    and after the load, and the page is written by ``write_res_html()``.

    An earlier, now removed, variant reached the map through a Sogou search
    page using Chrome mobile emulation with a random user agent.

    Args:
        input_: Search term appended verbatim to the query URL.

    NOTE(review): *input_* is not URL-encoded here; characters such as '&'
    or '#' in a name would presumably alter the query -- confirm.
    """
    chk_time()
    url_ = '%s%s' % ('http://map.baidu.com/?s=s%26wd%3D', input_)
    sleep(2)
    browser.get(url_)
    # Give the page time to render before its source is captured.
    sleep(2)
    write_res_html(browser)
class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` when started.

    ``args`` is handed to ``func`` as ONE positional argument; it is not
    unpacked.
    """

    def __init__(self, func, args, name):
        """Store the callable, its single argument and the thread name."""
        super().__init__(name=name)
        self.func = func
        self.args = args

    def run(self):
        """Invoke the stored callable with the stored argument."""
        self.func(self.args)
# Serialises use of the single shared browser and of the skip counter:
# every worker thread drives the same WebDriver session.
_browser_lock = threading.Lock()


def thread_city_district(city_district):
    """Crawl every target name of one city/district pair.

    For each full name stored under ``target_dic[city][district]`` the
    search term ``city + district + name`` is built.  Terms already present
    in ``requested_file_list`` are counted in ``requested_type_counter``
    and skipped; all others are fetched with ``mobile_mobile_pages_html``.

    Args:
        city_district: ``'<city>_<district>'``; split at the first
            underscore only, so a district may itself contain underscores.
    """
    global requested_type_counter
    city, district = city_district.split('_', 1)
    for type_ in target_dic[city][district]:
        for name_reduction in target_dic[city][district][type_]:
            for name_ in target_dic[city][district][type_][name_reduction]['name_reduction_list']:
                input_ = '%s%s%s' % (city, district, name_)
                # Without the lock, one thread could navigate the browser
                # while another is about to save its page.
                with _browser_lock:
                    if input_ in requested_file_list:
                        requested_type_counter += 1
                        print('requested_type_counter=', requested_type_counter, input_)
                        continue
                    mobile_mobile_pages_html(input_)
# One worker thread per (city, district) pair found in target_dic.
threads_list = []
for city in target_dic:
    for district in target_dic[city]:
        city_district = '%s_%s' % (city, district)
        thread_instance = MyThread(thread_city_district, city_district, thread_city_district.__name__)
        threads_list.append(thread_instance)

for t in threads_list:
    # Set the flag itself; assigning to ``setDaemon`` would only overwrite
    # the method and leave the flag untouched.
    t.daemon = False
    t.start()
for t in threads_list:
    t.join()
sleep(2)

# All workers are done: record the stop in the status database and release
# the browser.
py_stop_update_db()
try:
    try:
        # Cookies can only be cleared while the session is still open.
        browser.delete_all_cookies()
    finally:
        browser.quit()
except Exception:
    # Best effort: the browser may already have been closed by chk_time().
    print('last-line')
#! /usr/bin/env python
# coding=utf-8
import time, os, sched
import sqlite3
import sys
import random
# Absolute directory of this script and its parent directory; the parent is
# appended to sys.path.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.dirname(curPath)
sys.path.append(rootPath)

# Scheduler used for the periodic launches.  The first argument is the clock
# that gives the current time in seconds; the second is the function used to
# wait until the next event is due.
schedule = sched.scheduler(timefunc=time.time, delayfunc=time.sleep)
def perform_command(cmd, inc):
    """Run the shell command *cmd* and re-arm itself to run again.

    Args:
        cmd: Command line passed to ``os.system``.
        inc: Delay in seconds until the next run is scheduled.
    """
    # Queue the next run first (inc seconds from now), so that the command
    # is executed periodically.
    schedule.enter(inc, 0, perform_command, (cmd, inc))
    os.system(cmd)
# SQLite status database, located next to this script.
db = os.path.join(curPath, 'py_bdspider_status.db')
# Base names (without '.py') of the spider scripts that can be launched.
py_list = ['bd1', 'bd2', 'bd3', 'bd4']
# Directory containing the spider scripts (trailing separator included,
# because the command line is built by plain concatenation).
py_dir = 'D:\\pymine\\clean\\spider_map\\'
def gen_cmd_python_str():
    """Pick a spider script to launch and set its ``pystatus`` to 1.

    A candidate is drawn from ``py_list`` with two nested random draws (an
    upper bound first, then an index up to that bound), which favours
    entries near the start of the list.  If the candidate's stored status
    is 1, its neighbour is taken instead: the next entry, or the previous
    one when the candidate is the last entry.  The neighbour's own status
    is not checked.

    Returns:
        The chosen script's base name.

    NOTE(review): status 1 presumably means "running" -- confirm.
    """
    conn = sqlite3.connect(db)
    try:
        # Name the columns explicitly so the unpacking below does not depend
        # on the table's column order.
        cursor = conn.execute('SELECT pystatus, pyname FROM pystatus_table')
        py_db_dic = {pyname: pystatus for pystatus, pyname in cursor}

        len_ = len(py_list) - 1
        ii = random.randint(0, len_)
        i = random.randint(0, ii)
        to_requestpy = py_list[i]
        # A script without a row in the table is treated as not flagged.
        if py_db_dic.get(to_requestpy) == 1:
            if i == len_:
                to_requestpy = py_list[i - 1]
            else:
                to_requestpy = py_list[i + 1]

        # Bind the name as a parameter instead of pasting it into the SQL.
        sql_ = 'UPDATE pystatus_table SET pystatus = 1 WHERE pyname = ?'
        print(sql_, to_requestpy)
        conn.execute(sql_, (to_requestpy,))
        conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()
    return to_requestpy
# Highest valid index into py_list.
len_ = len(py_list) - 1


def timming_exe(inc=6):
    """Pick one spider script at random and launch it every *inc* seconds.

    The script is chosen once, with two nested random draws that favour
    entries near the start of ``py_list``; the same command line is then
    re-run by ``perform_command`` on every cycle.

    Args:
        inc: Delay in seconds before the first run and between runs.

    NOTE(review): the status-aware chooser ``gen_cmd_python_str()`` is
    disabled below -- confirm whether it should be used instead.
    """
    ii = random.randint(0, len_)
    i = random.randint(0, ii)
    to_requestpy = py_list[i]
    # to_requestpy = gen_cmd_python_str()
    cmd_str = '%s%s%s%s' % ('python ', py_dir, to_requestpy, '.py')
    schedule.enter(inc, 0, perform_command, (cmd_str, inc))
    # Keep running until the scheduler's event queue becomes empty.
    schedule.run()


timming_exe(2)
# Tags: thread command click dbid values bug print nts max
# Original article: http://www.cnblogs.com/yuanjiangw/p/7429672.html