
A simple crawler that collects the URLs of every post on 52pojie


 


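The script below prints a numbered menu of the 52pojie forum sections, asks which section to crawl, then walks that section page by page and appends each post's title and absolute URL, one per line, to a text file you name. Its only dependencies are requests and BeautifulSoup with the lxml parser.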
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time


title_list = [{'原创发布区': 'http://www.52pojie.cn/forum-2-1.html'},
              {'逆向资源区': 'http://www.52pojie.cn/forum-4-1.html'},
              {'脱壳破解区': 'http://www.52pojie.cn/forum-5-1.html'},
              {'动画发布区': 'http://www.52pojie.cn/forum-6-1.html'},
              {'悬赏问答区': 'http://www.52pojie.cn/forum-8-1.html'},
              {'水漫金山': 'http://www.52pojie.cn/forum-10-1.html'},
              {'站点公告': 'http://www.52pojie.cn/forum-13-1.html'},
              {'精品软件区': 'http://www.52pojie.cn/forum-16-1.html'},
              {'音乐视频': 'http://www.52pojie.cn/forum-19-1.html'},
              {'编程语言区': 'http://www.52pojie.cn/forum-24-1.html'},
              {'申请专区': 'http://www.52pojie.cn/forum-25-1.html'},
              {'LCG Area': 'http://www.52pojie.cn/forum-28-1.html'},
              {'病毒分析区': 'http://www.52pojie.cn/forum-32-1.html'},
              {'周年庆典活动专区': 'https://www.52pojie.cn/forum-36-1.html'},
              {'招聘求职': 'http://www.52pojie.cn/forum-39-1.html'},
              {'病毒样本区': 'http://www.52pojie.cn/forum-40-1.html'},
              {'安全工具区': 'http://www.52pojie.cn/forum-41-1.html'},
              {'电子书策划制作区': 'http://www.52pojie.cn/forum-42-1.html'},
              {'Key|Patch|共享账号': 'http://www.52pojie.cn/forum-44-1.html'},
              {'病毒救援区': 'http://www.52pojie.cn/forum-50-1.html'},
              {'影视推荐': 'http://www.52pojie.cn/forum-56-1.html'},
              {'LSG Area': 'http://www.52pojie.cn/forum-58-1.html'},
              {'软件调试区': 'http://www.52pojie.cn/forum-59-1.html'},
              {'T恤活动作品区': 'http://www.52pojie.cn/forum-62-1.html'},
              {'移动安全区': 'http://www.52pojie.cn/forum-65-1.html'},
              {'福利经验': 'http://www.52pojie.cn/forum-66-1.html'},
              {'2014CrackMe大赛': 'http://www.52pojie.cn/forum-67-1.html'},
              {'吾爱破解2016安全挑战赛': 'http://www.52pojie.cn/forum-71-1.html'},
              {'站务处理': 'http://www.52pojie.cn/forum-72-1.html'}]


# Fetch a page, retrying every 10 seconds until the request succeeds
def get_html(url):
    while True:
        try:
            response = requests.get(url)
            return response.text
        except Exception:
            time.sleep(10)
            continue


# Read a section's total page count from its first page
def get_page(url):
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    label_list = soup.find_all('label')
    page = int(label_list[3].span.string[3:-2])
    return page


# Download every page of a section, saving post titles and URLs to a file
def page_down(url):
    page = get_page(url)
    print("Total pages: " + str(page))
    txt = input("Enter the file name to save to (remember the extension): ")
    for j in range(1, page + 1):
        print(("Downloading page " + str(j)).center(40, "="))
        html = get_html(url[:-7] + '-' + str(j) + '.html')
        soup = BeautifulSoup(html, 'lxml')
        a_list = soup.find_all('a', attrs={'class': 's xst'})
        # Append each post title and its absolute URL to the file
        for a in a_list:
            with open(txt, 'a+', encoding='utf-8') as f:
                f.write(a.get_text())
                f.write('\n')
                f.write("https://www.52pojie.cn/" + a.attrs['href'])
                f.write('\n')
        print(("Page " + str(j) + " downloaded").center(40, "="))


def main():
    i = 0
    newline = 0  # renamed from 'time' to avoid shadowing the time module
    # Print the section menu, two entries per row
    for title in title_list:
        for key in title:
            if newline == 1:
                print((str(i) + ':' + key).ljust(20))
                newline = 0
            else:
                print((str(i) + ':' + key).ljust(20), end=" ")
                newline += 1
            i += 1

    # Keep asking until the input is a valid section number
    while True:
        try:
            print()
            num = int(input("Enter the number of the section to browse: "))
            if num > 28 or num < 0:
                print("Invalid input, please try again")
                continue
            else:
                break
        except Exception:
            print("Invalid input, please try again")
            continue
    # Look up the chosen section's URL and start downloading
    dict_t = title_list[num]
    for key in dict_t:
        print(dict_t[key])
        page_down(dict_t[key])


if __name__ == '__main__':
    main()
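The pagination in page_down relies on the forum's URL pattern: every entry in title_list points at page 1 of a section and therefore ends in the seven-character suffix '-1.html', so slicing that suffix off and appending the wanted page number rebuilds any page's URL. A minimal sketch of the slice, using the first section URL from the list:

url = 'http://www.52pojie.cn/forum-2-1.html'
base = url[:-7]                         # drop '-1.html' -> 'http://www.52pojie.cn/forum-2'
print(base + '-' + str(3) + '.html')    # http://www.52pojie.cn/forum-2-3.html

Note the slice only holds because every stored URL ends in exactly '-1.html'; a URL saved at a double-digit page would be cut in the wrong place.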

 
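get_html above retries forever, with no timeout and no request headers. Below is a minimal hardened sketch, not from the original post: the User-Agent value, the 10-second timeout, and the retry cap are illustrative assumptions (Discuz-based boards such as 52pojie can be unfriendly to clients that look like scripts).

import time
import requests

session = requests.Session()
# Assumption: a browser-like User-Agent; the exact value is illustrative
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def get_html(url, retries=5):
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=10)  # never hang forever
            response.raise_for_status()              # surface HTTP errors
            return response.text
        except requests.RequestException:
            time.sleep(10)                           # back off, then retry
    raise RuntimeError('failed to fetch ' + url)

Capping the retries turns a dead link into an error instead of an infinite loop, and the shared Session reuses one connection across the many page fetches.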



Original post: https://www.cnblogs.com/zbuter/p/8666350.html
