
Crawling Baidu search result URLs with Python

Posted: 2017-08-29 21:49:15


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-08-29 18:38:23
# @Author  : EnderZhou (zptxwd@gmail.com)
# @Link    : http://www.cnblogs.com/enderzhou/
# @Version : $Id$

import requests
import sys
from Queue import Queue  # Python 2; on Python 3 this would be "from queue import Queue"
import threading
from bs4 import BeautifulSoup as bs
import re

# By default this crawls 76 pages of Baidu search results. Invocation:
#   python this_script.py <keyword>
# Quote the keyword if it contains special characters. Results are written to
# txt files. Baidu sponsored (promoted) links are not filtered out yet; that
# may be improved later, along with deduplication of URLs from the same site
# that share a path and differ only in parameters.
# https://www.baidu.com/s?wd=ichunqiu&pn=10
# wd is the search keyword; pn selects the page: page 2 is 10, each page adds
# 10, and the maximum is 750, i.e. page 76.

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'}

class BaiduSpider(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        while not self._queue.empty():
            url = self._queue.get()
            try:
                self.spider(url)
            except Exception as e:
                # print e
                pass

    def spider(self, url):
        r = requests.get(url=url, headers=headers)
        soup = bs(r.content, 'html.parser')
        # Organic results are <a> tags carrying a data-click attribute and no
        # class; each href is a Baidu redirect that resolves to the real URL.
        urllist = soup.find_all(name='a', attrs={'data-click': re.compile('(.)'), 'class': None, 'data-is-main-url': None})
        for i in urllist:
            l = requests.get(url=i['href'], headers=headers)
            if l.status_code == 200:
                ll = l.url.split('/')
                lll = ll[0] + '//' + ll[2] + '\n'  # scheme://host
                # Change as needed whether to also display the root domain.
                sys.stdout.write(lll + l.url + '\n')
                f1 = open('out_para.txt', 'a+')
                f1.write(l.url + '\n')
                f1.close()
                # Record each scheme://host only once in out_index.txt.
                with open('out_index.txt') as f:
                    if lll not in f.read():
                        f2 = open('out_index.txt', 'a+')
                        f2.write(lll)
                        f2.close()

def main(keyword):
    queue = Queue()
    for i in range(0, 760, 10):
        l = 'https://www.baidu.com/s?wd=' + keyword + '&pn=' + str(i)
        # print l
        queue.put(l)
    threads = []
    thread_count = 5
    for i in range(thread_count):
        threads.append(BaiduSpider(queue))
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'Usage: python %s <keyword>' % sys.argv[0]
        sys.exit(-1)
    else:
        # Truncate output files left over from any previous run.
        f1 = open('out_para.txt', 'w')
        f1.close()
        f2 = open('out_index.txt', 'w')
        f2.close()
        main(sys.argv[1])
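
A typical run looks like this (the file name is whatever you saved the script as; actual output depends on Baidu's live results):

    python baidu_url_spider.py ichunqiu

out_para.txt then collects every resolved result URL, and out_index.txt collects each unique scheme://host once.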
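
The comment block above mentions planned filtering of URLs from the same site that share a path and differ only in query parameters. A minimal sketch of one way to do that, using urlparse from the Python 2 standard library (the normalize_url and is_new helpers and the lock are my additions, not part of the original script):

    from urlparse import urlparse  # Python 3: from urllib.parse import urlparse
    import threading

    seen = set()
    seen_lock = threading.Lock()  # spider() runs in several threads at once

    def normalize_url(url):
        # Collapse URLs that differ only in query string or fragment
        # down to scheme://host/path, so they count as one entry.
        p = urlparse(url)
        return p.scheme + '://' + p.netloc + p.path

    def is_new(url):
        # True the first time a normalized URL is seen, False afterwards.
        key = normalize_url(url)
        with seen_lock:
            if key in seen:
                return False
            seen.add(key)
            return True

Calling is_new(l.url) at the top of the result loop in spider() would then skip such repeats before anything is written to out_para.txt.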

 

Original post: http://www.cnblogs.com/enderzhou/p/7450223.html