Downloading all images from a single Baidu Tieba forum


"""
搜索百度贴吧单个贴吧内的所有帖子
使用xpath定位
完成翻页功能
下载详情页中的所有图片
"""
import os
import re
import time

import requests
from lxml import etree

# xpath for the next-page link: //*[@id="frs_list_pager"]/a[contains(text(),'下一页')]/@href

# Regex rewrite target markup:
# <a rel="noreferrer" href="/p/5564366573" title="新人求指导,这三家店正规么?万分感谢!" target="_blank" class="j_th_tit ">新人求指导,这三家店正规么?万分感谢!</a>
# <a rel="noreferrer" href="/p/5532141331" title="都说布偶猫性格好 大家有被布偶伸爪子抓过吗?" target="_blank" class="j_th_tit ">都说布偶猫性格好 大家有被布偶伸爪子抓过吗?</a>

class CatBa(object):

    def __init__(self, name):
        self.name = name
        self.url = 'http://tieba.baidu.com/f?kw={}'.format(name)
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)",
            # 'Cookie': 'BIDUPSID=DDE115B05C2CA4276EA17E431514BC87; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID=099C3C88481C218EE87AE81CC9711A7F:FG=1; PSTM=1519472311; H_PS_PSSID=1459_21094_22157; BDSFRCVID=OJKsJeCCxG39OkRAuJwbDkmyQoWuOCigBOma3J; H_BDCLCKID_SF=tJAj_D-btK03fP36qR6sMJ8thmT22-usB2QiQhcH0hOWsI8wXPRCLJLp5U5Ra-r8LCOULl7n2R3I8n6pDUC0-nDSHHKjJT-t3J; TIEBA_USERTYPE=8cff9d7553785bf414135144; TIEBAUID=cb23caae14130a0d384a57f1; bottleBubble=1; wise_device=0; bdshare_firstime=1519475463290; Hm_lvt_287705c8d9e2073d13275b18dbd746dc=1519475468; FP_UID=69c334ac70ace4caaeac308f190bb61d; Hm_lpvt_287705c8d9e2073d13275b18dbd746dc=1519475820; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1519474819,1519475411,1519475637,1519476450; PSINO=2; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1519479464'
        }

    def get_page(self, url):
        response = requests.get(url, headers=self.headers)
        # TODO: clean up the commented-out sections of the response
        return response.content

    def parse_data(self, data):
        """
        1. Collect every thread detail URL on the current list page into a list.
        2. After collecting this page's URLs, look for the next page with xpath.
        3. If a next page exists, return its URL so paging continues;
           otherwise return None and let the caller move on to the images.
        :return: (next_url, url_list)
        """
        # Target markup:
        # <a rel="noreferrer" href="/p/5564366573" title="新人求指导,这三家店正规么?万分感谢!" target="_blank" class="j_th_tit ">新人求指导,这三家店正规么?万分感谢!</a>
        # Regex alternative (unused):
        # temp_list = re.findall(r'<a rel="noreferrer" href="/p/(\d+)" title=".*?" target="_blank" class="j_th_tit ">.*?</a>', data, re.DOTALL)
        # Build an element tree
        url_list = []
        html = etree.HTML(data)
        temp_list = html.xpath('//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a/@href')
        url_list.extend(temp_list)
        time.sleep(5)
        # This xpath may fail to match (hence the try/except):
        # //*[@id="frs_list_pager"]/a[contains(text(),"下一页")]/@href
        try:
            next_url = html.xpath('//*[@id="frs_list_pager"]/a[contains(text(),"下一页")]/@href')[0]
        except Exception:
            return None, url_list
        return next_url, url_list

    def get_pic_url(self, data):
        html = etree.HTML(data)
        pic_url_list = html.xpath('//div[contains(@id,"post_content_")]/img/@src')
        return pic_url_list

    def open_pic(self, url):
        try:
            response = requests.get(url, headers=self.headers)
        except Exception:
            print('Failed to open image')
            return None

        return response.content


    def download_pic(self, pic_url_list):
        if not os.path.exists(self.name):
            os.mkdir(self.name)

        for each_url in pic_url_list:
            if 'emoticon' not in each_url:
                name = self.name + os.sep + each_url.split('/')[-1]
                pic_content = self.open_pic(each_url)
                if pic_content is None:  # download failed, skip this image
                    continue
                try:
                    with open(name, 'wb') as f:
                        f.write(pic_content)
                except Exception:
                    pass

    def run(self):
        while True:
            data = self.get_page(self.url)
            self.url, url_list = self.parse_data(data)
            print(self.url)
            print(url_list)
            for each_url in url_list:
                real_url = 'http://tieba.baidu.com' + each_url
                html = self.get_page(real_url)
                pic_url_list = self.get_pic_url(html)
                self.download_pic(pic_url_list)

            if not self.url:
                break
            else:
                # the next-page href is protocol-relative (//tieba.baidu.com/...)
                self.url = 'http:' + self.url


if __name__ == '__main__':
    cat = CatBa('李毅')  # change the forum name here to crawl a different tieba
    cat.run()
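
For reference, the sketch below exercises the next-page xpath from parse_data() against a tiny hand-written snippet. The snippet, the kw=cat query string, and the pn values are invented for illustration; real Tieba markup carries more attributes, but the pager id and the 下一页 link text are all the expression relies on.

# Standalone xpath sketch (illustration only, not part of the script above).
# Assumption: the markup below is a made-up stand-in for the Tieba pager.
from lxml import etree

sample = '''
<div id="frs_list_pager">
    <a href="//tieba.baidu.com/f?kw=cat&amp;pn=0">上一页</a>
    <a href="//tieba.baidu.com/f?kw=cat&amp;pn=100" class="next pagination-item">下一页&gt;</a>
</div>
'''

html = etree.HTML(sample)

# Same expression as in parse_data(): pick the <a> whose text contains "下一页".
hits = html.xpath('//*[@id="frs_list_pager"]/a[contains(text(),"下一页")]/@href')

if hits:
    # The href is protocol-relative, so run() prefixes it with "http:".
    print('http:' + hits[0])   # http://tieba.baidu.com/f?kw=cat&pn=100
else:
    # No match means the last page has been reached; run() would stop here.
    print('no next page')

If the same expression comes back empty on a real list page, one likely cause (hinted at by the TODO in get_page) is that Tieba serves parts of the page inside HTML comments, which lxml will not expose to xpath until they are stripped.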


Original post: https://www.cnblogs.com/guducp/p/9085386.html
