码迷,mamicode.com
首页 > 其他好文 > 详细

xcvx

时间:2016-11-06 16:45:15      阅读:200      评论:0      收藏:0      [点我收藏+]

标签:dex   反向   谷歌   item   body   服务器   tmp   insert   port   

# -*- coding: utf-8 -*-
# Scrape submarine-cable "owners" and "capacity" tables from the
# TeleGeography research service (behind a login form) and persist them
# into a local SQLite database, remembering which pages were already done.
from selenium import webdriver
from lxml.etree import HTML
import sqlite3, time, os

driver = webdriver.Firefox()
driver.implicitly_wait(30)  # wait up to 30s for elements to appear
verificationErrors = []
accept_next_alert = True
driver.get('https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/index.html')
# Log in. NOTE(review): credentials are hard-coded in source — move them
# to an environment variable or config file before sharing this script.
driver.find_element_by_id("username").clear()
driver.find_element_by_id("username").send_keys("user123")
driver.find_element_by_id("password").clear()
driver.find_element_by_id("password").send_keys("giant987")
driver.find_element_by_id("login-submit").click()
##############################################################
# Create/connect the database. The original duplicated the connect calls
# in both branches; only the table creation is first-run-specific.
first_run = not os.path.exists('telegeography.db')
conn = sqlite3.connect('telegeography.db')
cur = conn.cursor()
if first_run:
    cur.execute('CREATE TABLE OwersTB (CableName VARCHAR (300), Owners VARCHAR (300), Percentage VARCHAR (10), PRIMARY KEY (CableName, Owners));')
    cur.execute('CREATE TABLE CapacityTB (CableName VARCHAR (300), CableNamePart VARCHAR (300), Time VARCHAR (15), Lit_Fiber_Pairs VARCHAR (15), Per_Fiber_Pair VARCHAR (15), Gbps_per_Wavelength VARCHAR (15), "Total_Capacity(Gbps)" VARCHAR (15), PRIMARY KEY (CableName, CableNamePart, Time));')
    cur.execute('CREATE TABLE urlsTB (url VARCHAR (300) PRIMARY KEY);')
html = HTML(driver.page_source)
# URLs already scraped in a previous run
exists_urls = [row[0] for row in cur.execute('SELECT url FROM urlsTB').fetchall()]
##############################################################
urls = html.xpath('//*[@id="content"]/table[@class="wide"]/tbody/tr/td/a/@href')
for i in urls:
    if str(i) in exists_urls:
        continue  # skip pages already scraped
    URL = 'https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/' + i
    driver.get(URL)
    time.sleep(1)
    html = HTML(driver.page_source)
    Owers_Array = html.xpath('//*[@id="content"]//div[@class="owners"]/table/tbody/tr')
    Capacity_Array = html.xpath('//*[@id="content"]//div[@class="cable_capacity"]/table/tbody/tr')
    try:
        # Hoisted: the page title (cable name) is reused for every row.
        title = html.xpath('//title/text()')[0].strip()
        if Owers_Array:  # page has an owners table
            for item in Owers_Array:
                cells = item.xpath('td/text()')
                cur.execute('INSERT INTO OwersTB VALUES (?,?,?);',
                            (title, cells[0].strip(), cells[1].strip()))
        if Capacity_Array:  # page has a capacity table
            # Last two rows are skipped — presumably footer/summary rows;
            # TODO(review): confirm against the live page layout.
            for item in Capacity_Array[:-2]:
                cells = item.xpath('td/text()')
                cur.execute(
                    'INSERT INTO CapacityTB VALUES (?,?,?,?,?,?,?);',
                    (
                        title,
                        # nearest preceding <h3> names the cable segment
                        item.xpath('parent::tbody/parent::table/preceding-sibling::h3/text()')[-1].strip(),
                        item.xpath('th/text()')[0].strip(),
                        cells[0].strip(),
                        cells[1].strip(),
                        cells[2].strip(),
                        cells[3].strip(),
                    ))
        # Parameterized insert — the original interpolated the URL into the
        # SQL string, which breaks on quotes and is injection-prone.
        cur.execute('INSERT INTO urlsTB VALUES (?);', (str(i),))
        conn.commit()  # no error: commit this page's rows
        print(title)
    except Exception:  # narrowed from a bare except: (which also swallowed KeyboardInterrupt)
        conn.rollback()  # error: discard this page's partial rows
        print(html.xpath('//title/text()')[0].strip())
cur.close()
conn.close()
# -*- coding: utf-8 -*-
# NOTE(review): this is a verbatim duplicate of the scraper pasted earlier
# in the scraped page; kept so the page content is preserved, with the same
# fixes applied (quotes, indentation, parameterized SQL, narrowed except).
# Scrape submarine-cable "owners" and "capacity" tables from the
# TeleGeography research service (behind a login form) into SQLite.
from selenium import webdriver
from lxml.etree import HTML
import sqlite3, time, os

driver = webdriver.Firefox()
driver.implicitly_wait(30)  # wait up to 30s for elements to appear
verificationErrors = []
accept_next_alert = True
driver.get('https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/index.html')
# Log in. NOTE(review): credentials are hard-coded in source — move them
# to an environment variable or config file before sharing this script.
driver.find_element_by_id("username").clear()
driver.find_element_by_id("username").send_keys("user123")
driver.find_element_by_id("password").clear()
driver.find_element_by_id("password").send_keys("giant987")
driver.find_element_by_id("login-submit").click()
##############################################################
# Create/connect the database; only table creation is first-run-specific.
first_run = not os.path.exists('telegeography.db')
conn = sqlite3.connect('telegeography.db')
cur = conn.cursor()
if first_run:
    cur.execute('CREATE TABLE OwersTB (CableName VARCHAR (300), Owners VARCHAR (300), Percentage VARCHAR (10), PRIMARY KEY (CableName, Owners));')
    cur.execute('CREATE TABLE CapacityTB (CableName VARCHAR (300), CableNamePart VARCHAR (300), Time VARCHAR (15), Lit_Fiber_Pairs VARCHAR (15), Per_Fiber_Pair VARCHAR (15), Gbps_per_Wavelength VARCHAR (15), "Total_Capacity(Gbps)" VARCHAR (15), PRIMARY KEY (CableName, CableNamePart, Time));')
    cur.execute('CREATE TABLE urlsTB (url VARCHAR (300) PRIMARY KEY);')
html = HTML(driver.page_source)
# URLs already scraped in a previous run
exists_urls = [row[0] for row in cur.execute('SELECT url FROM urlsTB').fetchall()]
##############################################################
urls = html.xpath('//*[@id="content"]/table[@class="wide"]/tbody/tr/td/a/@href')
for i in urls:
    if str(i) in exists_urls:
        continue  # skip pages already scraped
    URL = 'https://www.telegeography.com/products/global-bandwidth-research-service/data/submarine-cable-profiles/' + i
    driver.get(URL)
    time.sleep(1)
    html = HTML(driver.page_source)
    Owers_Array = html.xpath('//*[@id="content"]//div[@class="owners"]/table/tbody/tr')
    Capacity_Array = html.xpath('//*[@id="content"]//div[@class="cable_capacity"]/table/tbody/tr')
    try:
        # Hoisted: the page title (cable name) is reused for every row.
        title = html.xpath('//title/text()')[0].strip()
        if Owers_Array:  # page has an owners table
            for item in Owers_Array:
                cells = item.xpath('td/text()')
                cur.execute('INSERT INTO OwersTB VALUES (?,?,?);',
                            (title, cells[0].strip(), cells[1].strip()))
        if Capacity_Array:  # page has a capacity table
            # Last two rows are skipped — presumably footer/summary rows;
            # TODO(review): confirm against the live page layout.
            for item in Capacity_Array[:-2]:
                cells = item.xpath('td/text()')
                cur.execute(
                    'INSERT INTO CapacityTB VALUES (?,?,?,?,?,?,?);',
                    (
                        title,
                        # nearest preceding <h3> names the cable segment
                        item.xpath('parent::tbody/parent::table/preceding-sibling::h3/text()')[-1].strip(),
                        item.xpath('th/text()')[0].strip(),
                        cells[0].strip(),
                        cells[1].strip(),
                        cells[2].strip(),
                        cells[3].strip(),
                    ))
        # Parameterized insert — the original interpolated the URL into the
        # SQL string, which breaks on quotes and is injection-prone.
        cur.execute('INSERT INTO urlsTB VALUES (?);', (str(i),))
        conn.commit()  # no error: commit this page's rows
        print(title)
    except Exception:  # narrowed from a bare except:
        conn.rollback()  # error: discard this page's partial rows
        print(html.xpath('//title/text()')[0].strip())
cur.close()
conn.close()

虽然之前挖坑的那篇已经说了一些,现在还是稍微说一说.

在模拟登录中,其实让写爬虫的人头疼的就是验证码,只要能破掉验证码,那么登录不是问题.

验证码(Captcha)内容从英文字符和数字识别,到数字加减乘除,再到汉字的出现,后面还有12306的看图识别,到现在的新型的基于人的行为的谷歌的reCaptcha,验证码也是经历了很长时间的演变。

对于很多人而言,对验证码的印象仅仅停留在网站登陆注册时候除了必要的用户名和密码以外还需要麻烦填的一个东西,而站在网站的角度,的确,验证码的应用对用户登陆没有什么作用,但是主要是防止机器人(包括爬虫)。服务器很怕大规模的请求,被百度或者谷歌这些大型搜索引擎收录当然是不错的,但是网上除了这两大搜索引擎以外还有许许多多爬虫,做的好的爬虫对服务器没有这么大的负担是极好的,关键是有很多其他的各种各样的爬虫,甚至会出现暴走的爬虫,这些爬虫都会对服务器造成很大的负担,而且并不会给网站带来受益.

不过讲真,验证码其实更重要的作用,并不是反爬虫的存在,尽管反爬虫场景中会经常见到验证码,但是Captcha应该属于人工智能范畴,是一种区别人和机器的存在,而由于这是计算机考察人类,所以有时候被称为’反向图灵测试’

对付验证码如果还是在cookie时代做用户登陆验证来说,分析验证码只要研究怎么从网页交互中找到从服务器发到终端的验证码字符串生成方式然后直接把这串字符串组合再发往服务器就可以了,没有必要对生成的验证码图片进行处理和识别,但是后面变成了Session 验证,所有的验证过程都发生在服务器,在终端只能拿到验证码图片,而没有验证码的字符串的存在,所以验证码图片的识别是避不开的坎,因为我们除非黑掉服务器,否则我们能利用的就只有验证码的图片了.

至于爬虫怎么拿到验证码图片,模拟浏览器的办法就是,selenium的截图功能,可以这么写:

xcvx

标签:dex   反向   谷歌   item   body   服务器   tmp   insert   port   

原文地址:http://www.cnblogs.com/try2016/p/6035409.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!