标签:replace urllib pre page 解码 比较 int 绝对路径 htm
#时间 2019年3月4日19:16:06
# Purpose: scrape any novel from the Biquge (笔趣阁) site.
import os
from urllib import request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
# Fetches the text of a single chapter and saves it to disk.
def secondOpenURL(url, ch_name):
    """Download one chapter page and write its body text to a file.

    The page is fetched from ``url``, decoded as GBK, and the element
    ``<div id="content">`` is extracted. Its ``<br/>`` tags and the wrapping
    ``<div>`` markup are stripped before the text is written as UTF-8.

    Args:
        url: Absolute URL of the chapter page.
        ch_name: Chapter title; used verbatim as the output file name.
            NOTE(review): titles containing path separators or other
            characters the filesystem rejects are not sanitised here.

    Returns:
        None. A progress line is printed once the chapter has been saved,
        or a notice is printed if the page has no content element.
    """
    # Request the chapter page; the context manager closes the connection.
    with request.urlopen(url) as response:
        html = response.read().decode('gbk')

    content = BeautifulSoup(html, 'html.parser').find('div', attrs={'id': 'content'})
    if content is None:
        # Without this guard str(None) would be written out as the chapter text.
        print('%s-》未找到正文,已跳过' % (ch_name))
        return

    # Clean up the body: drop line-break tags and the enclosing div markup.
    novel = str(content).replace('<br/>', '').replace('<div id="content">', '').replace('</div>', '')

    # Change this to your own save location; an absolute path is preferable.
    filepath = '../Day02/novel_剑来/剑来/%s' % (ch_name)
    # Create the target directory on first use so open() does not fail.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(novel)
    print('%s-》缓存完成' % (ch_name))
# Replace with the URL of the detail (chapter index) page of the book to fetch.
url = 'https://www.bequge.com/3_3109/'

# The site may serve utf-8 instead of gbk; switch the codec here if it does.
with request.urlopen(url) as response:
    page = response.read().decode('gbk')
soup = BeautifulSoup(page, 'html.parser')

# Every chapter link on the index page sits inside a <dd> element.
chapter_1 = soup.find_all('dd')
# Skip the first nine entries.
# NOTE(review): presumably these are the "latest chapters" shortcuts that
# duplicate entries further down the list - confirm against the live page.
chapter_2 = chapter_1[9:]

for ch in chapter_2:
    link = ch.a
    if link is None or not link.get('href'):
        # A <dd> without a usable link cannot be downloaded; skip it.
        continue
    # urljoin resolves site-relative, page-relative and absolute hrefs alike,
    # and keeps `url` pointing at the book page for the next iteration.
    chapter_url = urljoin(url, link['href'])
    chapter_name = str(ch.string)
    print(chapter_url, ch.string)
    secondOpenURL(chapter_url, chapter_name)
用的BeautifulSoup写的,比较简单。写得也不好,多见谅。剑来!
标签:replace urllib pre page 解码 比较 int 绝对路径 htm
原文地址:https://www.cnblogs.com/970401xcj/p/10472611.html