码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫 requests 和 beautiful soup 提取内容

时间:2020-02-10 12:03:57      阅读:77      评论:0      收藏:0      [点我收藏+]

标签:requests   import   get   request   string   title   写入   except   beautiful   

import requests
import time
from bs4 import BeautifulSoup

class getContents():
    """Download web pages and save their title and text content to disk.

    Every method is best-effort: failures are reported through a sentinel
    return value (or a printed message for ``write``) instead of an
    exception, so a crawl over many URLs keeps going after a bad page.
    """

    # Fetch the HTML page
    def getHTMLText(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the crawl indefinitely.

        Returns:
            The decoded page text, or ``""`` if the request fails, times
            out, or the server answers with an HTTP error status.
        """
        kv = {'user-agent': 'Mozilla/5.0'}
        try:
            r = requests.get(url, headers=kv, timeout=timeout)
            r.raise_for_status()
            # Decode with the encoding detected from the body rather than
            # the one announced in the HTTP headers.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            return ""

    # Extract the title
    def getTitle(self, str):
        """Return the text of the first ``<h3>`` element of an HTML document.

        Args:
            str: HTML markup to parse (name kept for backward compatibility,
                although it shadows the builtin inside this method).

        Returns:
            The heading text, or ``""`` if the markup cannot be parsed or
            contains no ``<h3>`` element.
        """
        try:
            tag = BeautifulSoup(str, 'html.parser')
            heading = tag.h3
            if heading is None:
                return ""
            title = heading.string
            if title is None:
                # ``.string`` is None when the heading wraps nested markup;
                # fall back to the concatenated text so callers never
                # receive None where they expect a string.
                title = heading.get_text()
            return title
        except Exception:
            # Parsing is best-effort: any parser failure means "no title".
            return ""

    # Extract the content
    def getContent(self, str):
        """Collect the text of the page's ``span``, ``font`` or ``div`` tags.

        The tag names are tried in that order and the first one that matches
        at least one element is used. Elements whose ``.string`` is None are
        skipped. The number of matched elements and the collected text are
        printed as progress output.

        Args:
            str: HTML markup to parse (name kept for backward compatibility).

        Returns:
            The concatenated text of the selected elements, or the sentinel
            ``"1"`` if parsing fails.
        """
        try:
            soup = BeautifulSoup(str, 'html.parser')
            p = []
            for tag_name in ('span', 'font', 'div'):
                p = soup.find_all(tag_name)
                if p:
                    break
            print(len(p))
            s = "".join(i.text for i in p if i.string is not None)
            print(s)
            return s
        except Exception:
            # NOTE(review): the "1" sentinel is kept from the original; it is
            # indistinguishable from a page whose extracted text is "1".
            return "1"

    # Write the content to a file
    def write(self, str, filename):
        """Write ``str`` to ``<filename>.txt`` encoded as UTF-8.

        Prints "成功" on success and "错误" on failure; never raises for an
        unusable file name or non-text content.

        Args:
            str: Text to write (name kept for backward compatibility).
            filename: Target path without the ``.txt`` extension.
        """
        try:
            filename = filename + '.txt'
            with open(filename, "w", encoding="utf-8") as f:
                f.write(str)
        except (OSError, TypeError, ValueError):
            # OSError: path cannot be opened or written; TypeError: filename
            # or content is not a string; ValueError: invalid path characters.
            print("错误")
        else:
            print("成功")


def main(url_file="urlneimenggu.txt", retry_delay=5, max_retries=None):
    """Fetch every URL listed in ``url_file`` and save each page's content.

    Each page's text is written to a file named after the page title.

    Args:
        url_file: UTF-8 text file with one URL per line.
        retry_delay: Seconds to sleep between attempts when a download
            returns an empty page.
        max_retries: Maximum number of retries per URL. ``None`` (the
            default) retries until the download succeeds.
    """
    with open(url_file, 'r', encoding="utf-8") as f:
        # Drop blank lines (for example the one produced by a trailing
        # newline): an empty URL can never be fetched and would keep the
        # retry loop below spinning forever.
        urls = [line.strip() for line in f if line.strip()]
    address = getContents()
    for url in urls:
        print(url)
        html = address.getHTMLText(url)
        attempts = 0
        # getHTMLText signals failure with an empty string.
        while html == "":
            if max_retries is not None and attempts >= max_retries:
                break
            print("等待中....")
            time.sleep(retry_delay)
            html = address.getHTMLText(url)
            attempts += 1
        if html == "":
            # Retries exhausted: report it and move on to the next URL.
            print("错误")
            continue
        title = address.getTitle(html)
        content = address.getContent(html)
        address.write(content, title)


if __name__ == "__main__":
    main()

爬虫 requests 和 beautiful soup 提取内容

标签:requests   import   get   request   string   title   写入   except   beautiful   

原文地址:https://www.cnblogs.com/acthis/p/12290194.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!