# Tags: lxml imp select import add formatting result __str__ code
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Index page whose news lists link to the individual articles.
INDEX_URL = 'http://dangjian.gmw.cn/node_11940.htm'
# Seconds to wait for the server; without a timeout requests.get() may
# block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def fetch_soup(page_url):
    """Download *page_url* and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Debugging tip: print(soup.prettify()) shows the parsed markup
    # re-indented, which helps when adjusting the selectors below.
    return BeautifulSoup(response.content, 'lxml')


def resolve_link(base_url, href):
    """Return *href* as an absolute http(s) URL, or None if it is not one.

    Relative references are resolved against *base_url* with ``urljoin``,
    so plain relative, root-relative ("/x") and scheme-relative ("//host/x")
    links all work. Blank values and other schemes (``javascript:``,
    ``mailto:`` ...) yield None because they cannot be fetched.
    """
    href = href.strip()
    if not href:
        return None
    absolute = urljoin(base_url, href)
    if not absolute.startswith(('http://', 'https://')):
        return None
    return absolute


def collect_article_urls(index_soup, base_url=INDEX_URL):
    """Return the set of absolute link targets found in the news lists.

    Only anchors inside ``<ul class="channel-newsGroup">`` elements are
    considered; duplicates collapse because the result is a set.
    """
    urls = set()
    for news_list in index_soup.find_all('ul', class_="channel-newsGroup"):
        # href=True skips anchors that carry no href attribute at all,
        # which would otherwise raise KeyError on anchor['href'].
        for anchor in news_list.find_all('a', href=True):
            link = resolve_link(base_url, anchor['href'])
            if link is not None:
                urls.add(link)
    return urls


def parse_article(article_soup):
    """Return ``(title, content)`` for an article page.

    Returns None when the page has no element with id ``articleTitle``,
    so callers can skip pages that are not articles instead of crashing.
    """
    title_node = article_soup.find(id="articleTitle")
    if title_node is None:
        return None
    paragraphs = article_soup.select("div #contentMain > p")
    # get_text() rather than .string: .string is None for a paragraph that
    # contains nested tags (or nothing), which previously put the literal
    # text "None" into the output.
    content = "".join(paragraph.get_text() for paragraph in paragraphs)
    return title_node.get_text(), content


def main():
    """Print the collected article URLs, then each article's title and text."""
    urls = collect_article_urls(fetch_soup(INDEX_URL))
    print(urls)
    for article_url in urls:
        article = parse_article(fetch_soup(article_url))
        if article is None:
            # Linked page is not an article (no title element); skip it.
            continue
        title, content = article
        print(title)
        print(content)


if __name__ == "__main__":
    main()
# Tags: lxml imp select import add formatting result __str__ code
# Original article: https://www.cnblogs.com/cord/p/9452950.html