码迷,mamicode.com
首页 > 其他好文 > 详细

xcvx

时间:2016-11-06 16:45:15      阅读:200      评论:0      收藏:0      [点我收藏+]

标签:dex   反向   谷歌   item   body   服务器   tmp   insert   port   

# -*- coding: utf-8 -*-
# Scrape submarine-cable "owners" and "capacity" tables from the
# TeleGeography research service (behind a login form) and persist them
# into a local SQLite database, remembering which pages were already done.
from selenium import webdriver
from lxml.etree import HTML
import sqlite3, time, os

driver = webdriver.Firefox()
driver.implicitly_wait(30)  # wait up to 30s for elements to appear
verificationErrors = []
accept_next_alert = True
driver.get('https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/index.html')
# Log in. NOTE(review): credentials are hard-coded in source — move them
# to an environment variable or config file before sharing this script.
driver.find_element_by_id("username").clear()
driver.find_element_by_id("username").send_keys("user123")
driver.find_element_by_id("password").clear()
driver.find_element_by_id("password").send_keys("giant987")
driver.find_element_by_id("login-submit").click()
##############################################################
# Create/connect the database. The original duplicated the connect calls
# in both branches; only the table creation is first-run-specific.
first_run = not os.path.exists('telegeography.db')
conn = sqlite3.connect('telegeography.db')
cur = conn.cursor()
if first_run:
    cur.execute('CREATE TABLE OwersTB (CableName VARCHAR (300), Owners VARCHAR (300), Percentage VARCHAR (10), PRIMARY KEY (CableName, Owners));')
    cur.execute('CREATE TABLE CapacityTB (CableName VARCHAR (300), CableNamePart VARCHAR (300), Time VARCHAR (15), Lit_Fiber_Pairs VARCHAR (15), Per_Fiber_Pair VARCHAR (15), Gbps_per_Wavelength VARCHAR (15), "Total_Capacity(Gbps)" VARCHAR (15), PRIMARY KEY (CableName, CableNamePart, Time));')
    cur.execute('CREATE TABLE urlsTB (url VARCHAR (300) PRIMARY KEY);')
html = HTML(driver.page_source)
# URLs already scraped in a previous run
exists_urls = [row[0] for row in cur.execute('SELECT url FROM urlsTB').fetchall()]
##############################################################
urls = html.xpath('//*[@id="content"]/table[@class="wide"]/tbody/tr/td/a/@href')
for i in urls:
    if str(i) in exists_urls:
        continue  # skip pages already scraped
    URL = 'https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/' + i
    driver.get(URL)
    time.sleep(1)
    html = HTML(driver.page_source)
    Owers_Array = html.xpath('//*[@id="content"]//div[@class="owners"]/table/tbody/tr')
    Capacity_Array = html.xpath('//*[@id="content"]//div[@class="cable_capacity"]/table/tbody/tr')
    try:
        # Hoisted: the page title (cable name) is reused for every row.
        title = html.xpath('//title/text()')[0].strip()
        if Owers_Array:  # page has an owners table
            for item in Owers_Array:
                cells = item.xpath('td/text()')
                cur.execute('INSERT INTO OwersTB VALUES (?,?,?);',
                            (title, cells[0].strip(), cells[1].strip()))
        if Capacity_Array:  # page has a capacity table
            # Last two rows are skipped — presumably footer/summary rows;
            # TODO(review): confirm against the live page layout.
            for item in Capacity_Array[:-2]:
                cells = item.xpath('td/text()')
                cur.execute(
                    'INSERT INTO CapacityTB VALUES (?,?,?,?,?,?,?);',
                    (
                        title,
                        # nearest preceding <h3> names the cable segment
                        item.xpath('parent::tbody/parent::table/preceding-sibling::h3/text()')[-1].strip(),
                        item.xpath('th/text()')[0].strip(),
                        cells[0].strip(),
                        cells[1].strip(),
                        cells[2].strip(),
                        cells[3].strip(),
                    ))
        # Parameterized insert — the original interpolated the URL into the
        # SQL string, which breaks on quotes and is injection-prone.
        cur.execute('INSERT INTO urlsTB VALUES (?);', (str(i),))
        conn.commit()  # no error: commit this page's rows
        print(title)
    except Exception:  # narrowed from a bare except: (which also swallowed KeyboardInterrupt)
        conn.rollback()  # error: discard this page's partial rows
        print(html.xpath('//title/text()')[0].strip())
cur.close()
conn.close()
# -*- coding: utf-8 -*-
# NOTE(review): this is a verbatim duplicate of the scraper pasted earlier
# in the scraped page; kept so the page content is preserved, with the same
# fixes applied (quotes, indentation, parameterized SQL, narrowed except).
# Scrape submarine-cable "owners" and "capacity" tables from the
# TeleGeography research service (behind a login form) into SQLite.
from selenium import webdriver
from lxml.etree import HTML
import sqlite3, time, os

driver = webdriver.Firefox()
driver.implicitly_wait(30)  # wait up to 30s for elements to appear
verificationErrors = []
accept_next_alert = True
driver.get('https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/index.html')
# Log in. NOTE(review): credentials are hard-coded in source — move them
# to an environment variable or config file before sharing this script.
driver.find_element_by_id("username").clear()
driver.find_element_by_id("username").send_keys("user123")
driver.find_element_by_id("password").clear()
driver.find_element_by_id("password").send_keys("giant987")
driver.find_element_by_id("login-submit").click()
##############################################################
# Create/connect the database; only table creation is first-run-specific.
first_run = not os.path.exists('telegeography.db')
conn = sqlite3.connect('telegeography.db')
cur = conn.cursor()
if first_run:
    cur.execute('CREATE TABLE OwersTB (CableName VARCHAR (300), Owners VARCHAR (300), Percentage VARCHAR (10), PRIMARY KEY (CableName, Owners));')
    cur.execute('CREATE TABLE CapacityTB (CableName VARCHAR (300), CableNamePart VARCHAR (300), Time VARCHAR (15), Lit_Fiber_Pairs VARCHAR (15), Per_Fiber_Pair VARCHAR (15), Gbps_per_Wavelength VARCHAR (15), "Total_Capacity(Gbps)" VARCHAR (15), PRIMARY KEY (CableName, CableNamePart, Time));')
    cur.execute('CREATE TABLE urlsTB (url VARCHAR (300) PRIMARY KEY);')
html = HTML(driver.page_source)
# URLs already scraped in a previous run
exists_urls = [row[0] for row in cur.execute('SELECT url FROM urlsTB').fetchall()]
##############################################################
urls = html.xpath('//*[@id="content"]/table[@class="wide"]/tbody/tr/td/a/@href')
for i in urls:
    if str(i) in exists_urls:
        continue  # skip pages already scraped
    URL = 'https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/' + i
    driver.get(URL)
    time.sleep(1)
    html = HTML(driver.page_source)
    Owers_Array = html.xpath('//*[@id="content"]//div[@class="owners"]/table/tbody/tr')
    Capacity_Array = html.xpath('//*[@id="content"]//div[@class="cable_capacity"]/table/tbody/tr')
    try:
        # Hoisted: the page title (cable name) is reused for every row.
        title = html.xpath('//title/text()')[0].strip()
        if Owers_Array:  # page has an owners table
            for item in Owers_Array:
                cells = item.xpath('td/text()')
                cur.execute('INSERT INTO OwersTB VALUES (?,?,?);',
                            (title, cells[0].strip(), cells[1].strip()))
        if Capacity_Array:  # page has a capacity table
            # Last two rows are skipped — presumably footer/summary rows;
            # TODO(review): confirm against the live page layout.
            for item in Capacity_Array[:-2]:
                cells = item.xpath('td/text()')
                cur.execute(
                    'INSERT INTO CapacityTB VALUES (?,?,?,?,?,?,?);',
                    (
                        title,
                        # nearest preceding <h3> names the cable segment
                        item.xpath('parent::tbody/parent::table/preceding-sibling::h3/text()')[-1].strip(),
                        item.xpath('th/text()')[0].strip(),
                        cells[0].strip(),
                        cells[1].strip(),
                        cells[2].strip(),
                        cells[3].strip(),
                    ))
        # Parameterized insert — the original interpolated the URL into the
        # SQL string, which breaks on quotes and is injection-prone.
        cur.execute('INSERT INTO urlsTB VALUES (?);', (str(i),))
        conn.commit()  # no error: commit this page's rows
        print(title)
    except Exception:  # narrowed from a bare except:
        conn.rollback()  # error: discard this page's partial rows
        print(html.xpath('//title/text()')[0].strip())
cur.close()
conn.close()

虽然之前挖坑的那篇已经说了一些,现在还是稍微说一说.

在模拟登录中,其实让写爬虫的人头疼的就是验证码,只要能破掉验证码,那么登录不是问题.

验证码(Captcha)内容从英文字符和数字识别,到数字加减乘除,再到汉字的出现,后面还有12306的看图识别,到现在的新型的基于人的行为的谷歌的reCaptcha,验证码也是经历了很长时间的演变。

对于很多人而言,对验证码的印象仅仅停留在网站登陆注册时候除了必要的用户名和密码以外还需要麻烦填的一个东西,而站在网站的角度,的确,验证码的应用对用户登陆没有什么作用,但是主要是防止机器人(包括爬虫)。服务器很怕大规模的请求,被百度或者谷歌这些大型搜索引擎收录当然是不错的,但是网上除了这两大搜索引擎以外还有许许多多爬虫,做的好的爬虫对服务器没有这么大的负担是极好的,关键是有很多其他的各种各样的爬虫,甚至会出现暴走的爬虫,这些爬虫都会对服务器造成很大的负担,而且并不会给网站带来受益.

不过讲真,验证码其实更重要的作用,并不是反爬虫的存在,尽管反爬虫场景中会经常见到验证码,但是Captcha应该属于人工智能范畴,是一种区别人和机器的存在,而由于这是计算机考察人类,所以有时候被称为’反向图灵测试’

对付验证码如果还是在cookie时代做用户登陆验证来说,分析验证码只要研究怎么从网页交互中找到从服务器发到终端的验证码字符串生成方式然后直接把这串字符串组合再发往服务器就可以了,没有必要对生成的验证码图片进行处理和识别,但是后面变成了Session 验证,所有的验证过程都发生在服务器,在终端只能拿到验证码图片,而没有验证码的字符串的存在,所以验证码图片的识别是避不开的坎,因为我们除非黑掉服务器,否则我们能利用的就只有验证码的图片了.

至于爬虫怎么拿到验证码图片,模拟浏览器的办法就是,selenium的截图功能,可以这么写:

xcvx

标签:dex   反向   谷歌   item   body   服务器   tmp   insert   port   

原文地址:http://www.cnblogs.com/try2016/p/6035409.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!