标签:rip cti 段子 http none lib sleep mozilla gen
# python 3.7
"""Scrape riddle (question / answer) pairs from www.neihan8.com into a text file."""
import re
import time
from urllib.request import Request, urlopen


class Neihan(object):
    """Crawler for the ``/njjzw/`` listing pages of www.neihan8.com.

    Each listing page is downloaded, every (title, description) pair is
    extracted with a regular expression, and the pairs are appended to a
    UTF-8 text file.

    Args:
        outfile: Path of the text file the results are appended to.
        timeout: Socket timeout in seconds for each HTTP request.
    """

    # Captures (title text, description text) for each entry on a listing
    # page.  Raw string so that ``\s`` is a regex escape rather than an
    # (invalid) string escape; compiled once instead of on every page.
    ITEM_PATTERN = re.compile(
        r'class="title" title=".*?">(.*?)</a></h3>\s+<div class="desc">(.*?)</div>'
    )

    def __init__(self, outfile='11.txt', timeout=10):
        # Header names must not contain spaces, otherwise the server does not
        # recognise them ("User - Agent" is not "User-Agent").
        self.header = {
            'Host': 'www.neihan8.com',
            'Referer': 'https://www.neihan8.com/njjzw//',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            # NOTE(review): hard-coded session cookie captured from a browser;
            # presumably stale by now - confirm whether the site needs it.
            'Cookie': 'UM_distinctid=1673e837ae7146-0363c5477e0b8a-424f0928-13c680-1673e837ae9355; CNZZDATA1274349754=965294396-1542939999-%7C1542939999; Hm_lvt_94f4eb93f17efa632a5c8a01b23da410=1542942067; npreuecookieclassrecord=%2C2%2C14%2C1%2C; CNZZDATA5804950=cnzz_eid%3D222162018-1542942068-https%253A%252F%252Fwww.neihan8.com%252F%26ntime%3D1542942068; Hm_lpvt_94f4eb93f17efa632a5c8a01b23da410=1542943190',
        }
        # Base URL of the listing; page N (N > 1) lives at index_N.html below it.
        self.static = 'https://www.neihan8.com/njjzw/'
        self.outfile = outfile
        self.timeout = timeout

    @staticmethod
    def _unwrap_prefix(args):
        """Return the answer prefix carried in ``args``, or ``None``.

        Historically the prefix reached ``writePge`` wrapped in one or two
        tuples (``*args`` forwarded as a single positional argument), so
        tuples are peeled off until a non-tuple value or an empty tuple is
        found.
        """
        value = args
        while isinstance(value, tuple):
            if not value:
                return None
            value = value[0]
        return value

    def getPage(self, url, refer=None):
        """Download ``url`` and hand the decoded HTML to :meth:`parsePage`.

        Args:
            url: Absolute URL of the listing page to fetch.
            refer: Optional text written in front of every answer found on
                this page; ``None`` writes the answers without a prefix.

        Raises:
            urllib.error.URLError: If the request fails or times out.
            UnicodeDecodeError: If the body does not match its charset.
        """
        request = Request(url=url, headers=self.header)
        # The context manager closes the connection even if read() fails.
        with urlopen(request, timeout=self.timeout) as response:
            # Honour the charset declared by the server; fall back to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset)
        self.parsePage(html, refer)

    def parsePage(self, htmlres, *args):
        """Extract (title, description) pairs from ``htmlres`` and store them.

        Args:
            htmlres: HTML text of one listing page.
            *args: Optionally, the answer prefix as the first extra argument.
        """
        pairs = self.ITEM_PATTERN.findall(htmlres)
        self.writePge(pairs, self._unwrap_prefix(args))

    def writePge(self, p, *args):
        """Append the extracted pairs to the output file.

        Each pair is written as a ``问题:<title>`` line, a line with the
        stripped description (preceded by the prefix when one is given), and
        a blank separator line.

        Args:
            p: Iterable of ``(title, description)`` string pairs.
            *args: Optionally the answer prefix, either directly or wrapped
                in a tuple (the legacy calling convention).
        """
        prefix = self._unwrap_prefix(args)
        with open(self.outfile, 'a', encoding='utf8') as f:
            for title, desc in p:
                answer = desc.strip()
                if prefix is not None:
                    answer = prefix + answer
                f.write('问题:' + title + '\n' + answer + '\n')
                f.write('\n')

    def workon(self, pages=9, delay=2):
        """Crawl the first ``pages`` listing pages.

        Args:
            pages: Number of listing pages to fetch, starting at page 1.
                The default of 9 matches the range the script has always
                crawled (its old comment claimed 20).
            delay: Seconds to wait between two consecutive requests.
        """
        for page in range(1, pages + 1):
            if page == 1:
                # NOTE(review): only the first page gets the answer prefix,
                # exactly as before - confirm whether later pages should too.
                self.getPage(self.static, refer='答案:')
            else:
                self.getPage(self.static + 'index_%s.html' % page)
            # Be polite to the server, but do not wait after the last page.
            if page < pages:
                time.sleep(delay)


if __name__ == '__main__':
    spider = Neihan()
    spider.workon()
标签:rip cti 段子 http none lib sleep mozilla gen
原文地址:https://www.cnblogs.com/Skyda/p/10006672.html