使用 requests + BeautifulSoup 写一个简单的漫画爬虫

时间：2018-10-10 17:06:47 阅读：178 评论：0 收藏：0 [点我收藏+]

标签：menu .com sleep headers with open linux com lin 查找

from bs4 import BeautifulSoup
import requests
import os
from time import sleep

class get_img(object):
    """Download the page images of one comic from www.gumua.com.

    ``get_url`` collects the chapter page URLs linked from the comic's index
    page; ``img`` visits each chapter page, gathers the ``<img>`` sources and
    writes them to disk as sequentially numbered ``.jpg`` files.
    """

    # Seconds to wait for any single HTTP response; without a timeout
    # ``requests`` blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def get_url(self, url="http://www.gumua.com/Manhua/28307.html"):
        """Collect the chapter page URLs listed on the comic index page.

        Args:
            url: Address of the comic's index page.

        Returns:
            The list of absolute chapter URLs, in the reverse of the order in
            which they appear on the page. The same list is stored on
            ``self.new_url``. It is empty when the chapter menu is not found.
        """
        self.new_url = []
        headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0"}
        res = requests.get(url=url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.content, "lxml")
        menu = soup.find(class_="d_menu")
        if menu is None:
            # find() returns None when nothing matches; report it instead of
            # crashing on the attribute access below.
            print("爬取失败:" + url)
            return self.new_url
        for anchor in menu.select("li > a"):
            link = anchor.get("href")
            if link:  # skip anchors without an href
                self.new_url.append("http://www.gumua.com" + link)
        self.new_url.reverse()
        return self.new_url

    def img(self, root="/Users/caojialin/work2018/木乃伊新娘/"):
        """Download every image of every chapter into ``root``.

        Files are named ``0.jpg``, ``1.jpg``, ... in the order the images are
        discovered. Files that already exist are left untouched, and a failed
        download is reported and skipped so the remaining images are still
        fetched.

        Args:
            root: Directory the images are written to; it is created
                (including missing parents) when it does not exist.
        """
        chapter_urls = self.get_url()
        imgs = []
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
        for chapter_url in chapter_urls:
            res = requests.get(url=chapter_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(res.content, "lxml")
            body = soup.find("div", class_="r_img ")
            if body is None:
                # No image container on this page: report and move on rather
                # than aborting the whole run.
                print("爬取失败:" + chapter_url)
                continue
            for tag in body.select("img"):
                # A missing src is kept as None so the file numbering stays
                # aligned with the page order; the download step reports it.
                imgs.append(tag.get("src"))

        # The target directory does not change between images: create it once.
        if root:
            try:
                os.makedirs(root, exist_ok=True)
            except OSError as e:
                print("爬取失败:" + str(e))
                return

        for index, src in enumerate(imgs):
            path = os.path.join(root, str(index) + ".jpg")
            if os.path.exists(path):
                print("文件已存在")
                continue
            try:
                r = requests.get(src, timeout=self.REQUEST_TIMEOUT)
                r.raise_for_status()
                sleep(0.1)  # brief pause between downloads
                with open(path, "wb") as f:
                    f.write(r.content)
                print("爬取完成")
            except Exception as e:
                # Best-effort: one bad image must not stop the others.
                print("爬取失败:" + str(e))

标签：menu .com sleep headers with open linux com lin 查找

原文地址：https://www.cnblogs.com/paoye/p/9767194.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行