
Tkinter Scraper (Zealer, Mydrivers) -- with Proxy



The project is three small Python 2 modules plus a Tkinter front end. First, UseProxy.py builds a urllib2 opener that authenticates against an HTTP proxy (the credentials and proxy address below are placeholders):

import urllib2

class UseProxy(object):
    def __init__(self):
        # Placeholder credentials and proxy address -- replace with real values.
        self.user = 'aaaa'
        self.password = 'bbbb'
        self.proxyserver = 'xxx.yyy.zzz:8080'
        self.content = ''

    def getproxy(self):
        # Build an opener that routes HTTP traffic through the authenticated proxy.
        proxy = 'http://%s:%s@%s' % (self.user, self.password, self.proxyserver)
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
        return opener
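A minimal standalone check of the opener, assuming real credentials have been filled in; the target URL here is just an illustration:

from UseProxy import UseProxy

if __name__ == '__main__':
    opener = UseProxy().getproxy()
    # Fetch a page through the proxy and report how much came back.
    html = opener.open('http://www.example.com').read()
    print 'fetched %d bytes through the proxy' % len(html)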

GetZealerVideo.py fetches the Zealer front page through the proxy, then pairs each post URL (pulled from the li_layer divs) with its title (from the subject divs):

from urlparse import urljoin
import re
from UseProxy import *
from bs4 import BeautifulSoup

class GetZealerVideo(object):
    def __init__(self):
        self.url = 'http://www.zealer.com'
        self.content = ''
        self.lists = []

    def splitcontent(self, proxyset):
        self.content = proxyset.getproxy().open(self.url).read().decode('utf-8')
        soup = BeautifulSoup(self.content, "html.parser")
        founddiv = soup.findAll('div', {'class': 'subject'})
        foundli = soup.findAll('div', {'id': re.compile("^li_layer")})
        l = len(founddiv)
        self.lists = []
        if l == len(foundli):
            for i in range(l):
                # Extract the /post/<id> path and make it absolute.
                b = re.search(r'/post(/\d+)*', str(foundli[i]))
                self.lists.append(urljoin(self.url, b.group()))
                self.lists.append(founddiv[i].contents[0].encode('utf-8'))
        return self.lists

if __name__ == '__main__':
    gvideo = GetZealerVideo()
    proxyset = UseProxy()
    print ''.join(gvideo.splitcontent(proxyset)).decode('utf-8')
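The path extraction is easiest to see on a canned fragment; the HTML below is a made-up stand-in for one of Zealer's li_layer divs:

import re
from urlparse import urljoin

# Hypothetical fragment shaped like a li_layer div on the Zealer front page.
fragment = '<div id="li_layer_1"><a href="/post/123">watch</a></div>'
m = re.search(r'/post(/\d+)*', fragment)
print urljoin('http://www.zealer.com', m.group())  # http://www.zealer.com/post/123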

GetMydrivers.py does the same for Mydrivers, whose pages are GB-encoded, so BeautifulSoup is told the encoding explicitly:

from UseProxy import *
from bs4 import BeautifulSoup

class GetMydrivers(object):
    def __init__(self):
        self.url = 'http://www.mydrivers.com'
        self.content = ''
        self.lists = []

    def splitcontent(self, proxyset):
        self.content = proxyset.getproxy().open(self.url).read()
        # The site is GB-encoded; tell BeautifulSoup up front.
        soup = BeautifulSoup(self.content, "html.parser", from_encoding="gb18030")
        print soup.original_encoding
        founddiv = soup.findAll('span', {'class': 'titl'})

        for i in range(len(founddiv)):
            self.lists.append(founddiv[i].contents[0])
        return self.lists

if __name__ == '__main__':
    gnews = GetMydrivers()
    proxyset = UseProxy()
    lists = gnews.splitcontent(proxyset)
    for l in lists:
        print str(l).decode('utf-8').encode('gb18030')
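In Python 2, str() on a BeautifulSoup tag yields UTF-8 bytes, which is why the loop above decodes from UTF-8 and then re-encodes to gb18030 for a GB-locale console. A toy round-trip with an arbitrary sample string:

# -*- coding: utf-8 -*-
# Illustrates the utf-8 -> gb18030 re-encode used when printing titles.
title_bytes = u'驱动之家'.encode('utf-8')            # what str(tag) produces in Python 2
print title_bytes.decode('utf-8').encode('gb18030')  # displays correctly on a GBK console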


Finally, the Tkinter front end wires both scrapers to a Listbox; double-clicking an entry that starts with http opens it in the default browser:

# -*- coding: utf-8 -*-
from Tkinter import *
from time import ctime
import os
import re
import GetZealerVideo as soup
import GetMydrivers as mnews
from UseProxy import *

class GetResource(object):
    def __init__(self):
        self.win = Tk()

        self.l1 = StringVar(self.win)
        self.msg = ""
        self.frame = Frame(width=800, height=600, bg='white')
        self.frame.propagate(False)
        self.frame.pack()

        self.scroll = Scrollbar(self.frame)
        self.scroll.pack(side=RIGHT, fill=Y)
        self.listbox = Listbox(self.frame, selectbackground='blue', font=12,
                               height=550, width=750,
                               yscrollcommand=self.scroll.set,
                               xscrollcommand=self.scroll.set)
        self.listbox.pack(side=TOP, fill=BOTH)
        # Double-click opens the selected entry.
        self.listbox.bind('<Double-1>', self.get_select)

        self.frame2 = Frame(width=800, height=50, bg='white')
        self.frame2.propagate(False)
        self.frame2.pack()
        Button(self.frame2, text=u'Get Zealer', command=self.zealer_video).pack(expand=YES)
        Button(self.frame2, text=u'Get Mydrivers', command=self.my_drivers).pack(expand=YES)

    def my_drivers(self):
        print 'start get at:', ctime()
        self.listbox.delete(0, END)
        self.getm = mnews.GetMydrivers()
        proxyset = UseProxy()
        for l in self.getm.splitcontent(proxyset):
            s = str(l).decode('utf-8')
            try:
                # Each item is an <a> tag: show the href, then the link text.
                self.listbox.insert(END, re.findall(r'(?<=href=").+?(?=">)', s)[0] + "\r\n")
                self.listbox.insert(END, re.findall(r'(?<=>).+?(?=<)', s)[0] + "\r\n")
                self.listbox.update()
            except IndexError:
                pass
        print 'get done at:', ctime()

    def zealer_video(self):
        print 'start get at:', ctime()
        self.listbox.delete(0, END)
        self.getz = soup.GetZealerVideo()
        proxyset = UseProxy()
        for l in self.getz.splitcontent(proxyset):
            self.listbox.insert(END, l + "\r\n")
            self.listbox.update()
        print 'get done at:', ctime()

    def get_select(self, ev=None):
        self.listbox.config(selectbackground='red')
        print self.listbox.curselection()
        self.check = self.listbox.get(self.listbox.curselection())
        if self.check:
            if re.match('http', self.check):
                # os.startfile is Windows-only; it opens the URL with the default handler.
                os.startfile(self.check)

def main():
    d = GetResource()
    mainloop()

if __name__ == '__main__':
    main()
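One design note: listbox.update() is called after each insert so titles appear as they are scraped; without it the window would stay frozen until the whole fetch finished, because the HTTP requests run on the Tk event-handler thread.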

 


Original article: http://www.cnblogs.com/guojian2080/p/4631822.html
