标签:%s from name parent 手机版 rip open append soup
!:注意编码格式(重要的事情说三遍):requests 根据响应头猜测的编码可能与页面实际编码不一致,会导致中文乱码。
!!:http://xiaorui.cc/2016/02/19/%E4%BB%A3%E7%A0%81%E5%88%86%E6%9E%90python-requests%E5%BA%93%E4%B8%AD%E6%96%87%E7%BC%96%E7%A0%81%E9%97%AE%E9%A2%98/
!!!:https://www.zhihu.com/question/264878732
!!!!:解决办法:xx.encoding = xx.apparent_encoding(必须赋值给 encoding;只写 xx.apparent_encoding 而不赋值不会生效)
"""Save the first chapters of a biqukan.com novel into a local text file.

The script downloads a book's index page, collects chapter links from the
page's ``<dd>`` entries, then downloads each chapter and appends its title
and body to ``<book title>.txt`` in the current directory.
"""
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# The hard-coded footer the original script stripped was
# "<chapter url>请记住本书首发域名:www.biqukan.com。笔趣阁手机版阅读网址:m.biqukan.com".
# The footer is located by the start of its fixed text instead of by a
# character set, so ordinary trailing characters of the chapter are kept.
_FOOTER_MARKER = '请记住本书首发域名'
_TRAILING_URL = re.compile(r'https?://\S*\s*$')

# Module-level state kept for backward compatibility:
#   article - title/content of the most recently downloaded chapter
#   ll      - links returned by the most recent getlink() call
article = {}
ll = []


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
    """
    res = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    # The encoding requests derives from the HTTP headers can be wrong for
    # Chinese pages; decode with the encoding detected from the body instead.
    res.encoding = res.apparent_encoding
    return BeautifulSoup(res.text, 'html.parser')


def _clean_content(text):
    """Normalise chapter text and cut off the trailing site footer."""
    text = text.replace('\r', ' ').replace('\u3000', '').replace('\xa0', '')
    cut = text.rfind(_FOOTER_MARKER)
    if cut == -1:
        # No footer found: leave the text alone rather than guess.
        return text
    # The footer text is preceded by the chapter's own URL; drop that as well.
    return _TRAILING_URL.sub('', text[:cut])


def getlink(url, soup=None, start=12, stop=20):
    """Collect chapter hrefs from the ``<dd>`` entries of a book index page.

    Args:
        url: Address of the book index page.
        soup: Already parsed index page; when given, *url* is not fetched.
        start: Index of the first ``<dd>`` entry to read.
        stop: Index one past the last ``<dd>`` entry to read.
            NOTE(review): the 12..20 default comes from the original script,
            presumably to skip a leading block of non-chapter entries -
            confirm against the live page layout.

    Returns:
        The module-level list ``ll``, refilled with the hrefs found. A page
        with fewer ``<dd>`` entries than *stop* simply yields fewer links.
    """
    if soup is None:
        soup = _fetch_soup(url)
    # Reset first so repeated calls do not accumulate duplicate links.
    ll.clear()
    for entry in soup.find_all('dd')[start:stop]:
        ll.extend(a['href'] for a in entry.find_all('a', href=True))
    return ll


def gettext(url, count=3):
    """Append the first *count* chapters of the book at *url* to a text file.

    The file is named after the text of the first ``.info h2`` element of
    the index page and is opened in append mode, encoded as UTF-8.

    Raises:
        requests.RequestException: if a page cannot be downloaded.
        IndexError: if an expected element (``.info h2``, ``.content h1``,
            ``.showtxt``) is missing from a page.
    """
    soup = _fetch_soup(url)
    # Reuse the parsed index page instead of downloading it a second time.
    links = getlink(url, soup=soup)
    filename = soup.select('.info h2')[0].text
    with open('%s.txt' % filename, 'a', encoding='utf-8') as f:
        for href in links[:count]:
            chapter = _fetch_soup(urljoin(url, href))
            article['title'] = chapter.select('.content h1')[0].text
            article['content'] = _clean_content(
                chapter.select('.showtxt')[0].text)
            f.write(article['title'] + '\n')
            f.write(article['content'] + '\n')


url = 'http://www.biqukan.com/1_1094/'

if __name__ == '__main__':
    gettext(url)
标签:%s from name parent 手机版 rip open append soup
原文地址:https://www.cnblogs.com/leolaosao/p/9095746.html