标签:
1 import httplib 2 from HTMLParser import HTMLParser 3 import urlparse 4 import urllib 5 from bs4 import BeautifulSoup 6 import re 7 from time import sleep 8 ‘‘‘ 9 usage: 10 11 12 input your wooyun cookies,then just go crawl!! 13 14 author: Elliott 15 16 17 ‘‘‘ 18 19 20 21 22 domain = ‘wooyun.org‘ 23 cookies = ‘‘ # !!!!here input your wooyun cookies 24 user_agent = ‘Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0‘ 25 26 27 28 def countnumber(): # the function to get page num 29 global domain 30 global cookies 31 global user_agent 32 conn = httplib.HTTPConnection(domain) 33 conn.request(‘GET‘,‘/user.php?action=openbugs‘,‘‘,{‘Cookie‘:cookies,‘User-Agent‘:user_agent,‘Referer‘:‘http://wooyun.org/index.php‘,‘Host‘:‘wooyun.org‘}) 34 content = conn.getresponse() 35 content = content.read() 36 soup = BeautifulSoup(content) 37 tag = soup.find_all(‘p‘,attrs={‘class‘:‘page‘}) 38 if len(tag) == 0: 39 tag = ‘None‘ 40 else: 41 tag = str(tag[0]) 42 pattern = re.compile(‘>.*<a class=\"current\">‘) 43 result = pattern.findall(tag) 44 if len(result) == 0: 45 result = ‘None‘ 46 else: 47 result = str(result[0]) 48 number = filter(str.isdigit, result) 49 num = number[5:] #get then total page number 50 return int(num) 51 52 53 def dealthepage(content): 54 global domain 55 global cookies 56 global user_agent 57 conn = httplib.HTTPConnection(domain) 58 soup = BeautifulSoup(content) 59 k = soup.find_all(‘a‘) 60 item = k[27:47] 61 pattern = re.compile(‘href=\"(.+?)\"‘) 62 hreaf = [] 63 for i in range(len(item)): 64 ss = pattern.findall(str(item[i])) 65 if len(ss) == 0: 66 break 67 hreaf.append(str(ss[0])) 68 for i in hreaf: 69 #sleep(0.5) 70 conn.request(‘GET‘,i,‘‘,{‘Cookie‘:cookies,‘User-Agent‘:user_agent,‘Referer‘:‘http://wooyun.org/index.php‘,‘Host‘:‘wooyun.org‘}) 71 content2 = conn.getresponse() 72 content2 = content2.read() 73 soup2 = BeautifulSoup(content2) 74 imgtag = soup2.find_all(class_=‘credit‘) 75 ‘‘‘may be $ or cloud‘‘‘ 76 if len(imgtag) != 0: 77 findcloud = re.compile(‘src=\"\/images\/credit\.png\"‘) 78 findmoney = re.compile(‘src=\"\/images\/m(.?)\.png\"‘) 79 cloudnum = findcloud.findall(content2) 80 moneylevel = findmoney.findall(content2) 81 cloud = 0 82 money = 0 83 if len(cloudnum) != 0: 84 if len(cloudnum) == 1: 85 cloud = 1 86 if len(cloudnum) == 2: 87 cloud = 2 88 if len(cloudnum) == 3: 89 cloud = 3 90 if len(moneylevel) != 0: 91 if len(moneylevel) == 1: 92 money = 1 93 if len(moneylevel) == 2: 94 money = 2 95 if len(moneylevel) == 3: 96 money = 3 97 title = soup2.findAll(attrs={"class":"wybug_title"}) 98 if len(title) == 0: 99 title = ‘No Title‘ 100 else: 101 title = str(title[0]) 102 deltag = re.compile(‘r<[^>]+>‘) 103 title = deltag.sub(‘‘,title) 104 author = soup2.findAll(attrs={"class":"wybug_author"}) 105 if len(author) == 0: 106 author = ‘No name‘ 107 else: 108 author = str(author[0]) 109 author = deltag.sub(‘‘,author) 110 date = soup2.findAll(attrs={"class":"wybug_date"}) 111 if len(date) == 0: 112 date = ‘No time‘ 113 else: 114 date = str(date[0]) 115 date = deltag.sub(‘‘,date) 116 link = "http://www.wooyun.org"+i 117 link = str(link) 118 f = open("test.html","a+") 119 s = "<tr><td>level:cloud="+str(cloud)+"money="+str(money)+"</td><th>"+date+"</th><td><a href=‘"+link+"‘>"+title+"</a></td><th>"+author+"</th></tr><br>" 120 f.write(s) 121 f.close 122 123 124 125 126 127 128 if __name__ == ‘__main__‘: 129 num = countnumber() #get page num 130 for i in range(num): 131 conn = httplib.HTTPConnection(domain) 132 conn.request(‘GET‘,‘/user.php?action=openbugs&pNO=‘+str(i+1),‘‘,{‘Cookie‘:cookies,‘User-Agent‘:user_agent,‘Referer‘:‘http://wooyun.org/index.php‘,‘Host‘:‘wooyun.org‘}) 133 content = conn.getresponse() 134 content = content.read() 135 dealthepage(content) 136 137 138 139 140 141
附章效果图:
标签:
原文地址:http://www.cnblogs.com/elliottc/p/4992700.html