
Python crawler: fetching the next page



from time import sleep

import faker
import requests
from lxml import etree

fake = faker.Faker()

base_url = "http://angelimg.spbeen.com"

def get_next_link(url):
    # Parse the current page and return the absolute URL of the "next" link,
    # or False when there is no next page.
    content = downloadHtml(url)
    html = etree.HTML(content)
    next_url = html.xpath("//a[@class='ch next']/@href")
    if next_url:
        return base_url + next_url[0]
    else:
        return False

def downloadHtml(url):
    # Fetch a page with a random User-Agent and a Referer header and return its HTML text.
    user_agent = fake.user_agent()
    headers = {"User-Agent": user_agent, "Referer": "http://angelimg.spbeen.com/"}
    response = requests.get(url, headers=headers)
    return response.text

def getImgUrl(content):
    # Extract the image URL and the article title from the page HTML.
    html = etree.HTML(content)
    img_url = html.xpath('//*[@id="content"]/a/img/@src')
    title = html.xpath("//div[@class='article']/h2/text()")

    return img_url[0], title[0]

def saveImg(title, img_url):
    # Download the image and write it to txt/<title>.jpg (the txt/ directory must already exist).
    if img_url is not None and title is not None:
        with open("txt/" + str(title) + ".jpg", "wb") as f:
            user_agent = fake.user_agent()
            headers = {"User-Agent": user_agent, "Referer": "http://angelimg.spbeen.com/"}
            content = requests.get(img_url, headers=headers)
            #request_view(content)
            f.write(content.content)

def request_view(response):
    # Debug helper: dump the response body to a temporary file and open it in the
    # browser, injecting a <base> tag so relative links resolve against the original URL.
    import webbrowser
    request_url = response.url
    base_tag = ('<head><base href="%s">' % request_url).encode()
    content = response.content.replace(b"<head>", base_tag)
    with open("tmp.html", "wb") as tem_html:
        tem_html.write(content)
    webbrowser.open_new_tab("tmp.html")

def crawl_img(url):
    # Download one page, extract its image URL and title, then save the image.
    content = downloadHtml(url)
    img_url, title = getImgUrl(content)
    saveImg(title, img_url)

if __name__ == "__main__":
    url = "http://angelimg.spbeen.com/ang/4968/1"

    while url:
        print(url)
        crawl_img(url)
        url = get_next_link(url)

Another approach is to first read the total page count from the page and then loop over the page numbers; a sketch follows.
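The post doesn't include code for that variant, so here is a minimal sketch. It reuses downloadHtml, crawl_img, and etree from above; the pagination XPath (//div[@class='page']//span/text()) and the assumption that album URLs follow the /ang/<id>/<page> pattern are hypothetical and would need to be checked against the site's actual markup.

def crawl_by_page_count(album_url):
    # Read the total page count from the first page, then crawl each page in turn.
    content = downloadHtml(album_url)
    html = etree.HTML(content)
    # Hypothetical selector: a pagination element whose text is the total page count.
    total = int(html.xpath("//div[@class='page']//span/text()")[0])
    base = album_url.rsplit("/", 1)[0]  # e.g. http://angelimg.spbeen.com/ang/4968
    for page in range(1, total + 1):
        crawl_img("%s/%d" % (base, page))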


Original source: https://www.cnblogs.com/php-linux/p/12485691.html
