爬虫：使用 requests 和 Beautiful Soup 提取网页内容

时间：2020-02-10 12:03:57 阅读：77 评论：0 收藏：0 [点我收藏+]

标签：requests import get request string title 写入 except beautiful

import requests
import time
from bs4 import BeautifulSoup

class getContents():
    """Download web pages and extract their title and text with BeautifulSoup.

    The class keeps no state; its methods are grouped here only so a single
    instance can be reused for every URL that is crawled.
    """

    # Fetch the HTML page
    def getHTMLText(self, url, timeout=10):
        """Download *url* and return its decoded body.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` can block indefinitely on a
                stalled connection.

        Returns:
            The response text, decoded with the encoding detected from the
            page content, or ``""`` when the request fails or the server
            answers with an error status.
        """
        headers = {'user-agent': 'Mozilla/5.0'}
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            # Prefer the encoding sniffed from the body over the HTTP header.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Connection errors, timeouts, malformed URLs and HTTP error
            # statuses all derive from RequestException.
            return ""

    # Extract the title
    def getTitle(self, str):
        """Return the text of the first ``<h3>`` element in the HTML.

        Args:
            str: HTML source of the page. (The name shadows the builtin; it
                is kept so existing keyword callers continue to work.)

        Returns:
            The heading text, or ``""`` when the document has no ``<h3>`` or
            cannot be parsed.
        """
        html = str
        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception:
            # Best effort: an unparsable document simply has no title.
            return ""
        heading = soup.h3
        if heading is None:
            return ""
        title = heading.string
        if title is None:
            # ``.string`` is None when the heading contains nested markup;
            # fall back to the concatenated text instead of returning None.
            title = heading.get_text()
        return title

    # Extract the content
    def getContent(self, str):
        """Return the concatenated text of the page's content tags.

        ``<span>`` tags are used when the page has any; otherwise ``<font>``
        tags, and finally ``<div>`` tags. Tags whose ``.string`` is ``None``
        are skipped. The number of candidate tags and the extracted text are
        printed as progress output.

        Args:
            str: HTML source of the page. (The name shadows the builtin; it
                is kept so existing keyword callers continue to work.)

        Returns:
            The extracted text, or ``"1"`` when extraction fails.
        """
        html = str
        try:
            soup = BeautifulSoup(html, 'html.parser')
            tags = []
            for tag_name in ('span', 'font', 'div'):
                tags = soup.find_all(tag_name)
                if tags:
                    break
            print(len(tags))
            s = "".join(tag.text for tag in tags if tag.string is not None)
            print(s)
            return s
        except Exception:
            # Failure marker relied upon by existing callers.
            return "1"

    # Write the content to a file
    def write(self, str, filename):
        """Write text to ``filename + '.txt'`` encoded as UTF-8.

        Prints "成功" after a successful write and "错误" when the file cannot
        be written; no exception is propagated for those failures.

        Args:
            str: Text to write. (The name shadows the builtin; it is kept so
                existing keyword callers continue to work.)
            filename: Output path without the ``.txt`` extension.
        """
        try:
            filename = filename + '.txt'
            with open(filename, "w", encoding="utf-8") as f:
                f.write(str)
                print("成功")
        except (OSError, TypeError, ValueError):
            # OSError: bad path or permissions; TypeError: non-string
            # arguments; ValueError: text that cannot be encoded.
            print("错误")


def main(url_file="urlneimenggu.txt", max_retries=5, retry_delay=5):
    """Crawl every URL listed in *url_file* and save each page's text.

    For each non-blank line of the file the page is downloaded, its title and
    content are extracted, and the content is written to a file named after
    the title.

    Args:
        url_file: UTF-8 text file containing one URL per line.
        max_retries: How many times to retry a URL whose download came back
            empty before skipping it. ``None`` retries without limit.
        retry_delay: Seconds to sleep between retries.
    """
    # Read the whole list first so the file is not held open during the crawl.
    with open(url_file, 'r', encoding="utf-8") as f:
        urls = [line.strip() for line in f.read().split('\n')]

    address = getContents()
    for i in urls:
        if not i:
            # Blank lines (e.g. a trailing newline) can never be fetched and
            # would otherwise be retried forever.
            continue
        print(i)
        html = address.getHTMLText(i)
        attempts = 0
        while html == "" and (max_retries is None or attempts < max_retries):
            print("等待中....")
            time.sleep(retry_delay)
            html = address.getHTMLText(i)
            attempts += 1
        if html == "":
            # Give up on this URL instead of blocking the rest of the list.
            print("错误")
            continue
        title = address.getTitle(html)
        content = address.getContent(html)
        # NOTE(review): pages without a title are all written to ".txt" and
        # overwrite one another - confirm whether a fallback name is wanted.
        address.write(content, title)

# Run the crawl only when executed as a script, so that importing this module
# does not start network requests as a side effect.
if __name__ == "__main__":
    main()

标签：requests import get request string title 写入 except beautiful

原文地址：https://www.cnblogs.com/acthis/p/12290194.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行