
Python crawler: fetching the next page



from time import sleep

import faker
import requests
from lxml import etree

fake = faker.Faker()

base_url = "http://angelimg.spbeen.com"

def get_next_link(url):
    # Parse the current page and return the absolute URL of the "next" link,
    # or False when there is no next page.
    content = downloadHtml(url)
    html = etree.HTML(content)
    next_url = html.xpath("//a[@class='ch next']/@href")
    if next_url:
        return base_url + next_url[0]
    else:
        return False

def downloadHtml(url):
    # Fetch a page with a random User-Agent and a Referer header and return its HTML text.
    user_agent = fake.user_agent()
    headers = {"User-Agent": user_agent, "Referer": "http://angelimg.spbeen.com/"}
    response = requests.get(url, headers=headers)
    return response.text

def getImgUrl(content):
    # Extract the image URL and the article title from the page HTML.
    html = etree.HTML(content)
    img_url = html.xpath('//*[@id="content"]/a/img/@src')
    title = html.xpath("//div[@class='article']/h2/text()")

    return img_url[0], title[0]

def saveImg(title, img_url):
    # Download the image and write it to txt/<title>.jpg (the txt/ directory must already exist).
    if img_url is not None and title is not None:
        with open("txt/" + str(title) + ".jpg", "wb") as f:
            user_agent = fake.user_agent()
            headers = {"User-Agent": user_agent, "Referer": "http://angelimg.spbeen.com/"}
            content = requests.get(img_url, headers=headers)
            #request_view(content)
            f.write(content.content)

def request_view(response):
    # Debug helper: dump the response body to a temporary file and open it in the
    # browser, injecting a <base> tag so relative links resolve against the original URL.
    import webbrowser
    request_url = response.url
    base_tag = ('<head><base href="%s">' % request_url).encode()
    content = response.content.replace(b"<head>", base_tag)
    with open("tmp.html", "wb") as tem_html:
        tem_html.write(content)
    webbrowser.open_new_tab("tmp.html")

def crawl_img(url):
    # Download one page, extract its image URL and title, then save the image.
    content = downloadHtml(url)
    img_url, title = getImgUrl(content)
    saveImg(title, img_url)

if __name__ == "__main__":
    url = "http://angelimg.spbeen.com/ang/4968/1"

    while url:
        print(url)
        crawl_img(url)
        url = get_next_link(url)

Another approach is to first read the total page count from the page and then loop over the page numbers; a sketch follows.
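The post doesn't include code for that variant, so here is a minimal sketch. It reuses downloadHtml, crawl_img, and etree from above; the pagination XPath (//div[@class='page']//span/text()) and the assumption that album URLs follow the /ang/<id>/<page> pattern are hypothetical and would need to be checked against the site's actual markup.

def crawl_by_page_count(album_url):
    # Read the total page count from the first page, then crawl each page in turn.
    content = downloadHtml(album_url)
    html = etree.HTML(content)
    # Hypothetical selector: a pagination element whose text is the total page count.
    total = int(html.xpath("//div[@class='page']//span/text()")[0])
    base = album_url.rsplit("/", 1)[0]  # e.g. http://angelimg.spbeen.com/ang/4968
    for page in range(1, total + 1):
        crawl_img("%s/%d" % (base, page))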


Original source: https://www.cnblogs.com/php-linux/p/12485691.html
