标签:
try:  # Python 2
    import urllib2
    from urllib import quote as _quote
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.parse
    import urllib.request as urllib2
    from urllib.parse import quote as _quote


class UseProxy(object):
    """Factory for URL openers that send HTTP requests through an authenticated proxy.

    Attributes:
        user: proxy account name.
        password: proxy account password.
        proxyserver: proxy address as ``host:port``.
        content: empty string; not used by this class, kept for compatibility.
    """

    def __init__(self, user='aaaa', password='bbbb',
                 proxyserver='xxx.yyy.zzz:8080'):
        """Store the proxy settings.

        The defaults are the placeholder values the class was originally
        hard-coded with, so ``UseProxy()`` behaves as before.
        """
        self.user = user
        self.password = password
        self.proxyserver = proxyserver
        self.content = ''

    def getproxy(self):
        """Return an opener whose ``http`` traffic goes through the proxy.

        Returns:
            An ``OpenerDirector`` built from a ``ProxyHandler`` (for the
            ``http`` scheme only) and ``HTTPHandler``.
        """
        # Percent-quote the credentials: characters such as '@', ':' or '/'
        # would otherwise be parsed as part of the URL structure.  The proxy
        # handler unquotes them again before building the auth header.
        credentials = '%s:%s' % (_quote(self.user, safe=''),
                                 _quote(self.password, safe=''))
        proxy = 'http://%s@%s' % (credentials, self.proxyserver)
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        return urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
UseProxy
import re

try:  # Python 2
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.parse import urljoin

from UseProxy import *
from bs4 import BeautifulSoup


class GetZealerVideo(object):
    """Scrape post links and their titles from the page at ``self.url``.

    Attributes:
        url: page that is downloaded and used as the base for post links.
        content: decoded HTML of the last download.
        lists: result of the last ``splitcontent`` call.
    """

    # Path of a post inside the markup of a "li_layer" element.
    _POST_PATH = re.compile(r'/post(/\d+)*')

    def __init__(self):
        self.url = 'http://www.zealer.com'
        self.content = ''
        self.lists = []

    def splitcontent(self, proxyset):
        """Download the page through the proxy and extract (link, title) pairs.

        Args:
            proxyset: object whose ``getproxy()`` returns an opener with an
                ``open(url)`` method (e.g. a ``UseProxy`` instance).

        Returns:
            A flat list alternating between an absolute post URL and the
            UTF-8 encoded first child of the matching ``div.subject``.
            The list is empty when the number of ``div.subject`` elements
            differs from the number of ``li_layer`` elements, because the
            two sequences can then not be paired by position.
        """
        self.content = proxyset.getproxy().open(self.url).read().decode('utf-8')
        soup = BeautifulSoup(self.content, "html.parser")
        founddiv = soup.findAll('div', {'class': 'subject'})
        foundli = soup.findAll('div', {'id': re.compile("^li_layer")})
        self.lists = []
        if len(founddiv) != len(foundli):
            return self.lists
        for layer, subject in zip(foundli, founddiv):
            match = self._POST_PATH.search(str(layer))
            # Skip the whole pair when either half is missing so links and
            # titles stay aligned instead of crashing on None / IndexError.
            if match is None or not subject.contents:
                continue
            self.lists.append(urljoin(self.url, match.group()))
            self.lists.append(subject.contents[0].encode('utf-8'))
        return self.lists


if __name__ == '__main__':
    gvideo = GetZealerVideo()
    proxyset = UseProxy()
    items = gvideo.splitcontent(proxyset)
    # Entries may be byte strings (encoded titles); decode before joining.
    print('.'.join(item.decode('utf-8') if isinstance(item, bytes) else item
                   for item in items))
GetZealerVideo
from UseProxy import *
from bs4 import BeautifulSoup


class GetMydrivers(object):
    """Scrape the first child of every ``span.titl`` from the page at ``self.url``.

    Attributes:
        url: page that is downloaded.
        content: raw bytes of the last download.
        lists: result of the last ``splitcontent`` call.
    """

    def __init__(self):
        self.url = 'http://www.mydrivers.com'
        self.content = ''
        self.lists = []

    def splitcontent(self, proxyset):
        """Download the page through the proxy and collect the headline nodes.

        Args:
            proxyset: object whose ``getproxy()`` returns an opener with an
                ``open(url)`` method (e.g. a ``UseProxy`` instance).

        Returns:
            A list holding ``contents[0]`` of every ``<span class="titl">``
            that has at least one child.  The list is rebuilt on every call,
            so calling the method twice does not duplicate entries.
        """
        self.content = proxyset.getproxy().open(self.url).read()
        # The raw bytes are handed to the parser together with the encoding
        # to try, instead of being decoded here.
        soup = BeautifulSoup(self.content, "html.parser", from_encoding="gb18030")
        print(soup.original_encoding)
        self.lists = [span.contents[0]
                      for span in soup.findAll('span', {'class': 'titl'})
                      if span.contents]
        return self.lists


if __name__ == '__main__':
    gnews = GetMydrivers()
    proxyset = UseProxy()
    for item in gnews.splitcontent(proxyset):
        text = str(item)
        if isinstance(text, bytes):
            # Python 2: str() yields UTF-8 bytes; re-encode as gb18030 for output.
            print(text.decode('utf-8').encode('gb18030'))
        else:
            print(text)
GetMydrivers
# -*- coding: utf-8 -*-
try:  # Python 2
    from Tkinter import *
except ImportError:  # Python 3
    from tkinter import *
from time import ctime
import os
import re
import GetZealerVideo as soup
import GetMydrivers as mnews
from UseProxy import *


def _to_text(value):
    """Return *value* as text, decoding UTF-8 byte strings."""
    if not isinstance(value, (bytes, type(u''))):
        value = str(value)
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return value


class GetResource(object):
    """Tk window that lists scraped entries and opens a link on double-click."""

    # Value of the first href="..." attribute in an element's markup.
    _HREF = re.compile(r'(?<=href=").+?(?=">)')
    # First run of text enclosed between '>' and '<'.
    _TEXT = re.compile(r'(?<=>).+?(?=<)')

    def __init__(self):
        """Build the window: a scrollable listbox above a row of two buttons."""
        self.win = Tk()
        self.l1 = StringVar(self.win)
        self.msg = ""

        # Fixed-size frame; propagate(False) stops children from resizing it.
        self.frame = Frame(width=800, height=600, bg='white')
        self.frame.propagate(False)
        self.frame.pack()
        self.scroll = Scrollbar(self.frame)
        self.scroll.pack(side=RIGHT, fill=Y)
        self.listbox = Listbox(self.frame, selectbackground='blue', font='12',
                               height=550, width=750,
                               yscrollcommand=self.scroll.set)
        self.listbox.pack(side=TOP, fill=BOTH)
        # Wire the scrollbar back to the listbox so dragging it scrolls the
        # list; the scrollbar is vertical, so only the y view is connected.
        self.scroll.config(command=self.listbox.yview)
        self.listbox.bind('<Double-1>', self.get_select)

        self.frame2 = Frame(width=800, height=50, bg='white')
        self.frame2.propagate(False)
        self.frame2.pack()
        Button(self.frame2, text=u'Get Zealer',
               command=self.zealer_video).pack(expand=YES)
        Button(self.frame2, text=u'Get Mydrivers',
               command=self.my_drivers).pack(expand=YES)

    def my_drivers(self):
        """Replace the list with link/title rows scraped by ``GetMydrivers``."""
        print('start get at: %s' % ctime())
        self.listbox.delete(0, END)
        self.getm = mnews.GetMydrivers()
        proxyset = UseProxy()
        for item in self.getm.splitcontent(proxyset):
            markup = _to_text(item)
            try:
                self.listbox.insert(END, self._HREF.findall(markup)[0] + "\r\n")
                self.listbox.insert(END, self._TEXT.findall(markup)[0] + "\r\n")
                self.listbox.update()
            except IndexError:
                # Entry without a link or without text: nothing more to show.
                pass
        print('get done at: %s' % ctime())

    def zealer_video(self):
        """Replace the list with the entries scraped by ``GetZealerVideo``."""
        print('start get at: %s' % ctime())
        self.listbox.delete(0, END)
        self.getz = soup.GetZealerVideo()
        proxyset = UseProxy()
        for item in self.getz.splitcontent(proxyset):
            # Entries may be UTF-8 byte strings; the listbox gets text.
            self.listbox.insert(END, _to_text(item) + "\r\n")
            self.listbox.update()
        print('get done at: %s' % ctime())

    def get_select(self, ev=None):
        """Double-click handler: open the selected row if it is an http link.

        Args:
            ev: Tk event object supplied by the binding; unused.
        """
        self.listbox.config(selectbackground='red')
        selection = self.listbox.curselection()
        print(selection)
        if not selection:
            # Double-click on an empty list: there is no row to read.
            return
        self.check = self.listbox.get(selection[0])
        if self.check and self.check.startswith('http'):
            # Rows are stored with a trailing "\r\n" that is not part of the URL.
            target = self.check.strip()
            if hasattr(os, 'startfile'):
                os.startfile(target)
            else:
                # os.startfile exists only on Windows.
                import webbrowser
                webbrowser.open(target)


def main():
    """Create the window and run the Tk event loop."""
    app = GetResource()
    app.win.mainloop()


if __name__ == '__main__':
    main()
Tkinter爬虫(Zealer、Mydrivers)--with Proxy
标签:
原文地址:http://www.cnblogs.com/guojian2080/p/4631822.html