标签:beautiful web attr href 超时 proxy 自己 pat pre
# coding=utf-8
"""Load a JavaScript-rendered page with PhantomJS and return its prettified markup."""
import random
import headers
import xmlParse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import ProxyType

# Location of the PhantomJS executable driven by Selenium.
phantomjs_driver = 'C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'

# Proxy entries read from proxy_ip.xml. They are only referenced by the
# disabled proxy setup inside dynamic_load().
# NOTE(review): presumably a list of "ip:port" strings -- confirm in xmlParse.
ips = xmlParse.get_ip_port_from_xml('proxy_ip.xml')


def dynamic_load(url):
    """Open *url* in PhantomJS, print the prettified page source and return it.

    The browser is always shut down before returning, including when the page
    load fails or times out, so no PhantomJS process is left behind.

    Args:
        url: Address of the page to load.

    Returns:
        The page source after rendering, re-indented by
        ``BeautifulSoup(...).prettify()``.

    Raises:
        Whatever Selenium raises when the session cannot be started or the
        page load / script timeout expires.
    """
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # Pick a random entry from headers.my_headers as the User-Agent so the
    # request looks like it comes from a regular browser.
    desired_capabilities["phantomjs.page.settings.userAgent"] = (
        random.choice(headers.my_headers)
    )
    # Skip image downloads; pages load much faster without them.
    desired_capabilities["phantomjs.page.settings.loadImages"] = False

    # Optional proxy support (disabled). Adding a proxy to the capabilities
    # and starting a new session makes the browser visit the URL through it:
    # proxy = webdriver.Proxy()
    # proxy.proxy_type = ProxyType.MANUAL
    # proxy.http_proxy = random.choice(ips)
    # proxy.add_to_capabilities(desired_capabilities)

    # Alternative: open PhantomJS with the capabilities applied up front.
    # driver = webdriver.PhantomJS(executable_path=phantomjs_driver,
    #                              desired_capabilities=desired_capabilities)
    driver = webdriver.PhantomJS(executable_path=phantomjs_driver)
    try:
        # NOTE(review): this starts a second session carrying the custom
        # capabilities on top of the one created by the constructor --
        # confirm this is still preferred over the alternative above.
        driver.start_session(desired_capabilities)
        # Implicit wait of 5 seconds when locating elements; tune as needed.
        driver.implicitly_wait(5)
        # Page-load timeout of 20 seconds. driver.get() has no timeout
        # argument of its own (unlike requests.get()), and it has been seen
        # to hang forever without raising; this makes it fail instead.
        driver.set_page_load_timeout(20)
        # Script timeout of 20 seconds.
        driver.set_script_timeout(20)
        driver.get(url)
        # Following a "next page" link could be done like this (disabled):
        # next_page = driver.find_element_by_id(idd)  # .get_attribute('href')
        # driver.get(next_page)
        html = BeautifulSoup(driver.page_source, 'xml').prettify()
    finally:
        # Always terminate the PhantomJS process, even on timeout or error.
        driver.quit()
    print(html)
    return html


if __name__ == '__main__':
    url = 'http://www.chnmuseum.cn/tabid/218/Default.aspx?DynastySortID=5'
    dynamic_load(url)
标签:beautiful web attr href 超时 proxy 自己 pat pre
原文地址:http://www.cnblogs.com/by2016/p/6830695.html