标签:文本 htm soup 网页 rip bsp write tle name
# encoding: UTF-8
import urllib
import urllib.request
import bs4
from bs4 import BeautifulSoup as bs


def test1(
    url="http://www.stylusstudio.com/edifact/D95B/CODECO.htm",
    table_index=7,
    out_path='./text1.txt',
    encoding='cp852',
):
    """Dump the text of every row of one HTML table to stdout and a file.

    Downloads ``url``, parses it with BeautifulSoup's ``html.parser``,
    selects the ``<table>`` at position ``table_index`` and, for each
    ``<tr>`` inside it, prints the raw tag, its plain text, and the list of
    its stripped strings.  The string form of that list is appended to
    ``out_path`` as one line per row.

    Args:
        url: Page to download.  The body is decoded as UTF-8.
        table_index: Zero-based position of the table among all ``<table>``
            tags on the page (the default 7 selects the eighth table).
        out_path: File the rows are appended to (opened in ``'a'`` mode).
        encoding: Encoding of the output file.  Characters that cannot be
            represented in it raise ``UnicodeEncodeError``.

    Raises:
        IndexError: If the page has no table at ``table_index``.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode('UTF-8')

    soup = bs(data, 'html.parser')
    tables = soup.find_all('table')
    try:
        table = tables[table_index]
    except IndexError as err:
        raise IndexError(
            f"table index {table_index} out of range: "
            f"page contains {len(tables)} table(s)"
        ) from err
    rows = table.find_all('tr')

    # The context manager guarantees the file is closed even if a row fails
    # to print or cannot be encoded.
    with open(out_path, 'a', encoding=encoding) as out_file:
        for row in rows:
            # The raw tag, e.g.:
            #   <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
            #   <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
            #   <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
            #   Damage</td><td align="right"><span class="FrameDetailFont"> ×1
            #   </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>
            print(row)

            # The row rendered as plain text, e.g.:
            #   │─│─├─DAM Damage ×1 (M)
            print(row.get_text())

            # To read the content of one specific nested tag instead:
            #   print(row.td.span.get_text())

            # The row as a list of whitespace-stripped text fragments, e.g.:
            #   ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
            cells = list(row.stripped_strings)
            print(cells)

            # Alternative: get_text accepts a separator, and strip=True
            # additionally trims leading/trailing whitespace of each piece:
            #   soup.get_text("|")              # '\nI linked to |example.com|\n'
            #   soup.get_text("|", strip=True)  # 'I linked to|example.com'

            out_file.write(str(cells) + '\n')


if __name__ == '__main__':
    test1()
标签:文本 htm soup 网页 rip bsp write tle name
原文地址:http://www.cnblogs.com/smuxiaolei/p/7417384.html