import os
import ssl
import urllib.parse
import urllib.request

from lxml import etree

# Tieba is served over HTTPS; skip certificate verification so urlopen does not fail
ssl._create_default_https_context = ssl._create_unverified_context


def loadPage(url):
    """Fetch one forum list page and visit every thread linked on it."""
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read()
    # Parse the HTML document into an HTML DOM model
    content = etree.HTML(html)
    # Return the list of all matching thread links
    link_list = content.xpath('//li[@class=" j_thread_list clearfix"]//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
    print(link_list, len(link_list))
    for link in link_list:
        fulllink = "http://tieba.baidu.com" + link  # full URL of each thread
        loadImage(fulllink)  # collect the link of every image inside the thread


def loadImage(link):
    """Download every image embedded in one thread."""
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(link, headers=headers)
    html = urllib.request.urlopen(request).read()
    content = etree.HTML(html)
    # Return the list of all image links inside the thread
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for link in link_list:
        filename = link[-15:]  # use the last 15 characters of the URL as the file name
        urllib.request.urlretrieve(link, './tieba/' + filename)
        print("Downloaded" + '----' + filename)


def tiebaSpider(url, beginPage, endPage):
    # Each list page shows 50 threads, so the pn offset grows by 50 per page
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)


if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    startPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))

    url = "https://tieba.baidu.com/f?"
    # urlencode({'kw': kw}) ---> https://tieba.baidu.com/f?kw=美女
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    # fullurl = url + 'kw=' + kw
    # print(fullurl)
    os.makedirs('./tieba', exist_ok=True)  # make sure the download directory exists before urlretrieve writes into it
    tiebaSpider(fullurl, startPage, endPage)
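The pagination logic is easier to see in isolation: each Tieba list page shows 50 threads, and the pn query parameter is the thread offset, so page N maps to pn = (N - 1) * 50. The short sketch below (standalone, with a made-up forum name purely for illustration) only prints the URLs that tiebaSpider would request, without fetching anything:

import urllib.parse

# Hypothetical inputs for illustration only
kw = "python"
startPage, endPage = 1, 3

base = "https://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw})
for page in range(startPage, endPage + 1):
    pn = (page - 1) * 50  # 50 threads per list page
    print(base + "&pn=" + str(pn))

# Output:
#   https://tieba.baidu.com/f?kw=python&pn=0
#   https://tieba.baidu.com/f?kw=python&pn=50
#   https://tieba.baidu.com/f?kw=python&pn=100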
Source: https://www.cnblogs.com/dongpei/p/9404640.html