python小程序获取url

时间：2016-04-12 20:52:45 阅读：153 评论：0 收藏：0 [点我收藏+]

标签：

获取中国比较有权重的网站

# encoding=utf-8
import re
import requests
from bs4 import BeautifulSoup

class getUrl(object):
    """Collect site names from the top.chinaz.com ranking pages.

    ``beginer()`` downloads each listing page, extracts the inner text of
    the ``.col-gray`` spans and writes every collected entry, one per line,
    to ``site.text``.
    """

    # Listing pages are fetched for FIRST_PAGE <= page < PAGE_LIMIT.
    FIRST_PAGE = 2
    PAGE_LIMIT = 1680
    PAGE_URL = 'http://top.chinaz.com/all/index_%d.html'
    # Seconds to wait on a single request before giving up, so one stalled
    # connection cannot hang the whole crawl forever.
    REQUEST_TIMEOUT = 30
    # Captures the inner text of each <span> in the stringified result list.
    SPAN_TEXT = re.compile(r'<span.*?>(.*?)</span>')

    def __init__(self, num):
        """Store ``num`` and prepare the HTTP request headers.

        Args:
            num: Kept on the instance as ``totle``. NOTE(review): nothing in
                this class reads it; the crawl range is fixed by
                ``FIRST_PAGE`` / ``PAGE_LIMIT`` -- confirm intended meaning.
        """
        self.totle = num
        # Browser-like request headers sent with every page fetch.
        # 'Connection' has no leading space: newer requests releases reject
        # header values that start with whitespace.
        self.myheader = {
            'Host': 'top.chinaz.com',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
            'Accept': '*/*',
            'Referer': 'http://www.chinaz.com/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    def beginer(self):
        """Crawl every listing page and write the results to ``site.text``.

        Network errors raised by ``requests`` (including timeouts) are not
        caught here and propagate to the caller.
        """
        print('get start')
        collected = []
        for page in range(self.FIRST_PAGE, self.PAGE_LIMIT):
            response = requests.get(
                self.PAGE_URL % page,
                headers=self.myheader,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Name the parser explicitly so results do not depend on which
            # optional parser libraries happen to be installed.
            soup = BeautifulSoup(response.text, 'html.parser')
            gray_spans = soup.select('.col-gray')
            names = self.SPAN_TEXT.findall(str(gray_spans))
            # The first match on each page is discarded (presumably a
            # non-site label -- confirm against the page markup). Slicing
            # tolerates a page with no matches, where ``del names[0]``
            # would raise IndexError.
            collected.extend(names[1:])
        self.writeQQ(text=collected, file_dir='site.text', mode='w')

    def writeQQ(self, text, file_dir, mode):
        """Write each item of ``text`` to ``file_dir`` on its own line.

        Args:
            text: Iterable of strings to write.
            file_dir: Path of the output file.
            mode: Mode string passed straight to ``open`` (e.g. ``'w'``).
        """
        with open(file_dir, mode) as f:
            for site in text:
                f.write(site)
                f.write("\n")




# Script entry: build the crawler (44 is stored on the instance as `totle`)
# and run it. beginer() fetches the top.chinaz.com listing pages and writes
# the collected names to 'site.text', a path relative to the working directory.
spidre = getUrl(44)
spidre.beginer()

python小程序获取url

标签：

原文地址：http://www.cnblogs.com/zxcx/p/5384340.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

python小程序 获取url

python小程序获取url