A small crawler that scrapes emoticon packs from sc.chinaz.com and saves each pack's images into its own local folder:

from bs4 import BeautifulSoup
import os
import requests


# Fetch a page, returning either its decoded text or its raw bytes.
def getHtmlText(url, s='text'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # guess the real encoding from the body
        if s == 'text':
            return r.text
        elif s == 'content':
            return r.content
        else:
            return ''
    except requests.RequestException:
        # Return an empty value of the matching type so callers can still parse or write it.
        return b'' if s == 'content' else ''


# Collect each emoticon pack's name and detail-page link from a listing page.
def getEmotionInfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    emo_divs = soup.find_all('div', attrs={'class': 'up'})
    for div in emo_divs:
        a = div.find('div', attrs={'class': 'num_1'}).find('a')
        title = a.attrs['title']
        href = a.attrs['href']
        getEmotionImgInfo(title, href)


# Collect the URL of every image in one emoticon pack.
def getEmotionImgInfo(title, href):
    html = getHtmlText(href)
    soup = BeautifulSoup(html, 'html.parser')
    # Two next_sibling hops: the first lands on the whitespace text node
    # after the 'img_text' div, the second on the div holding the images.
    img_div = soup.find('div', attrs={'class': 'img_text'}).next_sibling.next_sibling
    imgs = img_div.find_all('img')
    url_list = [img.attrs['src'] for img in imgs]
    getImg(title, url_list)


# Download a pack's images and save them locally.
def getImg(title, url_list):
    root = os.path.join('D:\\pics', title)
    os.makedirs(root, exist_ok=True)  # create missing parent dirs; no error if present
    count_small = 0
    for key in url_list:
        path = os.path.join(root, key.split('/')[-1])
        if not os.path.exists(path):  # skip images already downloaded
            img_content = getHtmlText(key, 'content')
            with open(path, 'wb') as f:
                f.write(img_content)
        count_small = count_small + 1
        print('\r{} progress: {:.2f}%'.format(title, count_small * 100 / len(url_list)), end='')


if __name__ == '__main__':
    first_url = 'http://sc.chinaz.com/biaoqing/index.html'
    root_url = 'http://sc.chinaz.com/biaoqing/index_'

    pages = 20
    for i in range(1, pages):  # crawl listing pages 1 .. pages-1
        if i == 1:
            html = getHtmlText(first_url)  # the first page carries no index number
        else:
            url = root_url + str(i) + '.html'
            html = getHtmlText(url)
        getEmotionInfo(html)
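One caveat the post does not cover: the requests above go out with the default python-requests User-Agent, which some sites block, and a single transient network error makes getHtmlText give up immediately. Below is a minimal sketch of a sturdier fetch helper, assuming the site accepts a browser-like User-Agent; the header string, the get_html_text name, and the retry count are illustrative, not part of the original script.

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # hypothetical browser-like UA string; adjust as needed

def get_html_text(url, s='text', retries=3):
    # Same contract as getHtmlText above: decoded text by default,
    # raw bytes when s == 'content', and an empty value after all retries fail.
    for _ in range(retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.content if s == 'content' else r.text
        except requests.RequestException:
            continue  # transient failure: try again
    return b'' if s == 'content' else ''

Because the return values match, this could be dropped in for getHtmlText without touching the rest of the script.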
Original post: http://www.cnblogs.com/jp-mao/p/6759005.html