标签:爬虫
#coding:utf-8
import requests
import random
class TiebaSpider:
    """Download the first pages of a Baidu Tieba forum and save them as HTML.

    Each list page is fetched from ``https://tieba.baidu.com/f`` and written to
    ``<page number>.html`` in the current working directory.
    """

    # Step of the ``pn`` offset query parameter between consecutive pages.
    PAGE_SIZE = 50
    # Number of list pages fetched by ``run``.
    PAGE_COUNT = 30
    # Seconds to wait for the server before giving up on a request; without
    # a timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, tieba_name: str) -> None:
        """Store the forum name and build the paged URL template.

        Args:
            tieba_name: Name of the forum to download (the ``kw`` parameter).
        """
        # Imported locally so the block does not depend on file-level imports.
        from urllib.parse import quote

        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
        self.tieba_name = tieba_name
        # Percent-encode the name so characters such as "&", "#", "{" or "}"
        # can neither corrupt the query string nor break ``str.format`` below.
        self.url_temp = (
            "https://tieba.baidu.com/f?kw="
            + quote(tieba_name, safe="")
            + "&ie=utf-8&pn={}"
        )

    def get_url_list(self) -> list:
        """Return the URLs of the first ``PAGE_COUNT`` list pages, in order."""
        return [
            self.url_temp.format(page * self.PAGE_SIZE)
            for page in range(self.PAGE_COUNT)
        ]

    def parse_url(self, url: str) -> str:
        """Fetch ``url`` with the spider's headers and return the body as text.

        The response bytes are decoded with ``bytes.decode()``'s default
        codec (UTF-8).
        """
        print('正在请求%s' % url)
        res = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return res.content.decode()

    def save_html_str(self, html_str: str, page_num: int) -> None:
        """Write ``html_str`` to ``<page_num>.html`` in the working directory.

        Args:
            html_str: Page content to store.
            page_num: 1-based page number used to build the file name.
        """
        print('正在保存第%s页.html' % page_num)
        file_name = str(page_num) + '.html'
        # Explicit UTF-8: the locale default (e.g. GBK on Windows) may be
        # unable to encode characters present in the downloaded page.
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(html_str)
        print('保存%s成功' % file_name)

    def run(self) -> None:
        """Fetch every list page and save each one to its own file."""
        # 1. Build the list of page URLs.
        url_list = self.get_url_list()
        # 2. Walk the list, send each request and read the response.
        #    enumerate() supplies the 1-based page number directly.
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            # 3. Save the page.
            self.save_html_str(html_str, page_num)
# Script entry point: ask for a forum name on stdin and download its pages.
if __name__ == "__main__":
    tieba_name = input('请输入要贴吧名:')
    tieba = TiebaSpider(tieba_name)
    tieba.run()
本文出自 “梦女孩” 博客,请务必保留此出处http://dreamgirl1314.blog.51cto.com/1159474/1981063
标签:爬虫
原文地址:http://dreamgirl1314.blog.51cto.com/1159474/1981063