# 标签:方式 运行 lxml sel pytho header UI tree 代理
"""Scrape proxy addresses from xicidaili.com and keep the ones that respond.

Candidate ``ip:port`` strings scraped from the listing pages are queued in the
Redis list ``mylist``.  Each candidate is then used as an HTTP proxy to fetch a
page; candidates that succeed are pushed onto the Redis list ``usable_ip``.
"""
from urllib.request import Request, ProxyHandler
from urllib.request import build_opener
from urllib.request import urlopen
import re

from bs4 import BeautifulSoup
import MySQLdb
import redis
from lxml import etree

urlfront = "http://www.xicidaili.com"
url = "http://www.xicidaili.com/nn/1"
result = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Request header sent with every fetch so the request looks like a browser.
_USER_AGENT_HEADER = (
    'User-agent',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
)


def _make_opener(proxy):
    """Return a urllib opener that uses *proxy* and the browser User-agent."""
    opener = build_opener(ProxyHandler(proxy))
    opener.addheaders = [_USER_AGENT_HEADER]
    return opener


def get_allcode(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Raises whatever ``urllib`` raises on network failure, and
    ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # NOTE(review): ProxyHandler maps URL scheme -> proxy, so this entry is
    # only consulted for https:// URLs; the http:// listing pages used below
    # appear to be fetched directly.  Confirm whether that is intended.
    opener = _make_opener({'https': '110.73.0.45:8123'})
    # Context manager closes the connection even if read/decode fails.
    with opener.open(url) as r:
        return r.read().decode("UTF-8")


def find_ip(s):
    """Extract ``ip:port`` pairs from listing HTML *s* and queue them in Redis.

    Each table row with class ``odd`` or an empty class is inspected on its
    own, so an address is only ever paired with the port from the same row.
    Rows missing either value are skipped.  Pairs are pushed onto the Redis
    list ``mylist``.
    """
    if not s:
        return
    selector = etree.HTML(s)
    if selector is None:
        return
    for row in selector.xpath('//tr[@class="odd"]|//tr[@class=""]'):
        ip = None
        port = None
        for cell in row.xpath('./td/text()'):
            text = cell.strip()
            if '-' in text:
                # Cells containing '-' (e.g. dates) are neither ip nor port.
                continue
            if text.isdigit():
                if port is None:
                    port = text
            elif '.' in text:
                if ip is None:
                    ip = text
        if ip is not None and port is not None:
            result.lpush('mylist', ip + ":" + port)


def get_next_page(s):
    """Return the absolute URL of the next listing page, or ``None``.

    The link is taken from the ``a.next_page`` element inside
    ``div.pagination``; ``None`` is returned when no such link exists.
    """
    if not s:
        return None
    selector = etree.HTML(s)
    if selector is None:
        return None
    hrefs = selector.xpath('//div[@class="pagination"]/a[@class="next_page"]/@href')
    if not hrefs:
        return None
    return urlfront + hrefs[0]


def get_allcode_ip(url, ip, timeout=5):
    """Try to fetch *url* through proxy *ip*; record the proxy if it works.

    *ip* is an ``ip:port`` value as returned by Redis (``bytes``) or already
    decoded (``str``).  *timeout* is the per-request timeout in seconds.  On
    success the proxy is pushed onto the Redis list ``usable_ip``.  Any
    failure is printed and swallowed: a failing proxy is the expected case.
    """
    try:
        if isinstance(ip, bytes):
            ip = ip.decode("utf-8")
        opener = _make_opener({'http': ip})
        with opener.open(url, None, timeout) as r:
            # Read and decode the body so a proxy that connects but returns
            # garbage or stalls is still rejected.
            r.read().decode("UTF-8")
        print('+++++++++++++++')
        result.lpush('usable_ip', ip)
        print(ip)
        print('+++++++++++++++')
    except Exception as err:
        print(err)


def main():
    """Crawl every listing page, then validate every queued proxy."""
    page_url = url
    while page_url is not None:
        print(page_url)
        s = get_allcode(page_url)
        # Scrape before following the link so the last page is not skipped.
        find_ip(s)
        page_url = get_next_page(s)
        print(page_url)

    while True:
        ip = result.lpop('mylist')
        print(ip)
        if ip is None:
            break
        # Validate against the start page.  The module-level ``url`` is no
        # longer rebound by the crawl, so it is never None here.
        get_allcode_ip(url, ip)


# Runs on import as well as on direct execution, as the original script did.
main()
# 标签:方式 运行 lxml sel pytho header UI tree 代理
# 原文地址:http://www.cnblogs.com/qieyu/p/7846110.html