标签:keep 转化 提取 sele class ali 需要 ike one
用途:对给定的网页URL,区分可用和不可用的二级链接
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import bs4
import time
try:
    from urllib.parse import urljoin, urlparse  # Python 3
except ImportError:
    from urlparse import urljoin, urlparse  # Python 2

# Browser-like User-Agent sent with every request.
HEADERS = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
# Seconds to wait for each HTTP response before giving up.
REQUEST_TIMEOUT = 3
# Seconds to pause between two consecutive link checks.
CHECK_INTERVAL = 2


def resolve_link(base_url, href):
    """Turn an ``href`` found on ``base_url`` into an absolute HTTP(S) URL.

    Args:
        base_url: URL of the page the link was found on.
        href: Raw value of the anchor's ``href`` attribute, or ``None``.

    Returns:
        The absolute URL, or ``None`` when the href is missing, empty,
        cannot be parsed, or does not use the http/https scheme
        (e.g. ``mailto:`` or ``javascript:`` links).
    """
    if not href:
        return None
    try:
        # urljoin handles every relative form: "page.html", "/page.html",
        # "../page.html" and protocol-relative "//host/page.html".
        absolute = urljoin(base_url, href.strip())
        scheme = urlparse(absolute).scheme
    except ValueError:
        return None
    if scheme not in ('http', 'https'):
        return None
    return absolute


def is_link_ok(link):
    """Return True when ``link`` answers a GET request with status 200.

    Any error raised while performing the request (timeout, DNS failure,
    invalid URL, ...) counts as a bad link. ``Exception`` is caught rather
    than everything so that Ctrl-C still interrupts the scan.
    """
    try:
        response = requests.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except Exception:
        return False
    return response.status_code == 200


def check_links(url):
    """Fetch ``url`` and report each HTTP(S) link on it as ok or bad.

    One line is printed per checked link. If the page itself cannot be
    fetched or parsed, an error message and the exception are printed
    and no link is checked.
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        # The body is fully read by res.text; requests then returns the
        # connection to its pool, so no manual connection handling is needed.
        soup = bs4.BeautifulSoup(res.text, 'lxml')
    except Exception as e:
        print('输入的链接不可用')
        print(e)
        return

    for a_link in soup.select('a'):
        href = resolve_link(url, a_link.get('href'))
        # Skip anchors without a usable HTTP(S) target.
        if href is None:
            continue
        if is_link_ok(href):
            print(('%s is ok') % (href))
        else:
            print(('%s is bad') % (href))
        # Be polite to the target server between requests.
        time.sleep(CHECK_INTERVAL)


def main():
    """Prompt for a page URL and check every link found on it."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    check_links(read_line('请输入需要验证的链接:'))


if __name__ == '__main__':
    main()

# Blog tags: keep 转化 提取 sele class ali 需要 ike one
原文地址:http://blog.51cto.com/9473774/2070949