import random
import time

import requests
from bs4 import BeautifulSoup

book_name = 'jieqishu'  # novel slug, used in the URL and as the output filename
book_url = 'http://www.jieqishu.com' + '/' + book_name + '/'  # build the novel's index URL
response = requests.get(url=book_url)
response.encoding = response.apparent_encoding  # fix the character encoding
soup = BeautifulSoup(response.text, features='html.parser')
a = soup.find(id='list')  # container holding the chapter list
dd_all = a.find_all('dd')  # one <dd> per chapter link
http_all = []
for i in dd_all:
    http_all.append(book_url + i.find('a').attrs.get('href'))
http_all = http_all[8:]  # skip the duplicate "latest chapters" entries at the top of the list
m = 5  # cap the number of chapters fetched while testing
with open(book_name + '.txt', 'w') as f:
    n = 0  # chapter counter
    for i in http_all:
        if m == n:
            break
        h = requests.get(url=i)
        h.encoding = h.apparent_encoding
        hb = BeautifulSoup(h.text, features='html.parser')
        tar_t = hb.find(id='content')  # chapter body
        tar_h = hb.find('h1').text     # chapter title
        f.write(tar_h + '\n')
        for j in tar_t:
            if str(j) != '<br/>':
                f.write(str(j).lstrip() + '\n')
        time.sleep(random.randint(3, 6))  # random delay between requests to avoid an IP ban
        n += 1
        f.write('\n\n')
        print('Chapter %d written!' % n)
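
The random sleep above is the script's only anti-ban measure. As a complementary sketch (not part of the original script), a small fetch helper can also send a browser-like User-Agent and retry failed requests with a backoff; the header string, retry count, and backoff values below are illustrative assumptions.

import time
import requests

def fetch(url, retries=3, backoff=5):
    # Browser-like User-Agent; the exact string is an illustrative assumption.
    headers = {'User-Agent': 'Mozilla/5.0'}
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()  # treat HTTP errors (403/503, ...) as failures worth retrying
            r.encoding = r.apparent_encoding
            return r
        except requests.RequestException:
            time.sleep(backoff * (attempt + 1))  # linear backoff before the next try
    raise RuntimeError('giving up on ' + url)

With such a helper, the two requests.get(...) calls above could be swapped for fetch(book_url) and fetch(i).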
Original post: https://blog.51cto.com/12070874/2543132