爬取古诗文网古诗词

时间：2020-01-30 23:05:04 阅读：255 评论：0 收藏：0 [点我收藏+]

#python3.6
#爬取古诗文网的诗文

import requests
from bs4 import BeautifulSoup
import html5lib
import re
import os

def content(soup):
    """Collect poem titles and their page URLs from a category page.

    Walks the anchors returned by ``soup.find_all('a')``, skipping the first
    eight, and stops at the first link whose ``href`` is
    ``/gushi/tangshi.aspx`` or ``/gushi/xiaowen.aspx``.  Each processed entry
    is also echoed to stdout with a running 1-based index.

    Args:
        soup: Parsed page exposing ``find_all('a')``; every anchor must offer
            ``get('href')`` and a ``text`` attribute.

    Returns:
        A dict whose keys are poem titles (with full-width parenthesised
        parts removed) and whose values are absolute poem URLs.  Anchors
        without an ``href`` keep their falsy ``href`` value as the URL.
        A repeated title overwrites the earlier entry.
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    base_url = "https://so.gushiwen.org/"
    # Links to "Three Hundred Tang Poems" / "Primary-school classical texts"
    # mark the end of the poem list.
    stop_hrefs = ("/gushi/tangshi.aspx", "/gushi/xiaowen.aspx")

    poetrydict = {}
    for index, anchor in enumerate(soup.find_all('a')[8:], start=1):
        href = anchor.get('href')
        if href in stop_hrefs:
            break
        # urljoin avoids the doubled slash that plain concatenation produced
        # for root-relative hrefs and leaves absolute hrefs untouched.
        url = urljoin(base_url, href) if href else href
        # Keep only the title: drop full-width parentheses and their content.
        # The "·" separator is deliberately left in place.
        title = re.sub(r"（.*?）", "", str(anchor.text))
        print(index, title, url)
        poetrydict[title] = url

    return poetrydict

def fulltext(pdict):
    """Download every poem page and extract its cleaned-up text.

    Args:
        pdict: Mapping of poem title to poem page URL.  A falsy URL means
            there is no page to fetch for that title.

    Returns:
        A dict with the same titles as ``pdict``.  Each value is the poem
        text after whitespace removal, removal of ASCII-parenthesised
        annotations and ``formattext`` formatting, or ``None`` when the
        title had no URL.

    Raises:
        requests.RequestException: If a page cannot be fetched (including
            when the request times out).
        IndexError: If a page does not contain the ``div`` elements at the
            positions this function relies on.
    """
    poetrydict = {}
    for title, url in pdict.items():
        if not url:
            poetrydict[title] = None
            continue
        # A timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html5lib')
        # The poem body is located purely by position: the 5th <div> nested
        # inside the 10th <div> of the page.
        container = soup.find_all('div')[9]
        poetry = str(container.find_all('div')[4].text)
        # Remove all whitespace (spaces, line breaks).
        poetry = re.sub(r'\s+', '', poetry).strip()
        # Remove ASCII-parenthesised annotations (e.g. variant characters)
        # together with the parentheses themselves.
        poetry = re.sub(r"\(.*?\)", "", poetry)
        poetrydict[title] = formattext(poetry)
    return poetrydict

def formattext(s):
    """Apply simple line-break formatting to a poem string.

    A newline is inserted after each full-width sentence mark
    (``。！？：；``), and the quotation/title brackets ``「」《》`` are removed.
    Leading and trailing whitespace is stripped from the result.

    The formatting is intentionally basic and may need further refinement.

    Args:
        s: Poem text.

    Returns:
        The formatted text.
    """
    # Single pass over the string; every mark maps to itself plus a newline
    # (the semicolon previously came out as a colon), brackets map to nothing.
    table = str.maketrans({
        '。': '。\n',
        '！': '！\n',
        '？': '？\n',
        '：': '：\n',
        '；': '；\n',
        '「': None,
        '」': None,
        '《': None,
        '》': None,
    })
    return s.translate(table).strip()

def output(dict, text):
    """Write every poem to ``text`` and echo it to stdout, then close ``text``.

    Each entry is written to ``text`` as a ``####`` heading line with the
    title, followed by the poem body; the same title and body are printed to
    stdout without the heading marker.

    Args:
        dict: Mapping of poem title to poem text.  (The name shadows the
            builtin; it is kept so existing keyword callers keep working.)
        text: Writable text stream.  It is always closed before this
            function returns, even if writing fails part-way through.
    """
    try:
        for title, body in dict.items():
            print("####", title, "\n", body, file=text)
            print(title, "\n", body)
    finally:
        # Close unconditionally so a failed write does not leak the handle.
        text.close()

if __name__ == '__main__':
    # Category link taken from the site's right-hand sidebar,
    # e.g. https://so.gushiwen.org/gushi/tangshi.aspx
    strc = input("请输入(类别)链接：")
    sc = input("请输入文件名：")
    # A timeout keeps an unresponsive server from hanging the script.
    sw = requests.get(strc, timeout=10)
    soup = BeautifulSoup(sw.text, 'html5lib')
    # Output location: a Markdown file in a fixed directory.
    s = 'D:\\pythonPROJECT\\' + sc + ".md"
    # The context manager closes the file even when scraping fails before
    # output() is reached; output() closing it as well is harmless.
    with open(s, "w", encoding='utf-8') as file:
        output(fulltext(content(soup)), file)

爬取古诗文网古诗词

标签：att ext2 调整 mat 古诗词 tput rip 存储目录文件

原文地址：https://www.cnblogs.com/loeFairy/p/12244110.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行