标签:
# coding:utf-8
"""Fetch every comment of a Youku video and filter them by keyword.

The script asks for a video URL, downloads all comment pages in parallel,
then repeatedly asks for keywords and prints the de-duplicated comments
that contain at least one of them.  With no keywords it searches for
comments that talk about the background music.
"""
# Printed before the remaining imports so the user sees feedback right away.
print('正在初始化...')
import itertools
import os
import re
import sys
from multiprocessing.dummy import Pool

import pyautogui
import requests
from lxml import _elementpath  # NOTE(review): unused here; presumably kept so a frozen build bundles it - confirm
from lxml.html import fromstring

# Comment endpoint; the placeholders are (video id, page number).
COMMENT_URL = 'http://comments.youku.com/comments/~ajax/vpcommentContent.html?__ap={"videoid":"%s","page":%s}'
# NOTE(review): the page count is derived by dividing by 30, so the service
# presumably returns 30 comments per page - confirm.
PAGE_SIZE = 30
WORKERS = 30          # threads used to download comment pages
RETRIES = 5           # attempts per comment page
COMMENT_TIMEOUT = 3   # seconds, per comment-page request
PAGE_TIMEOUT = 10     # seconds, for the video page and the total-count request
# Keywords used when the user enters none (background-music related words).
DEFAULT_KEYWORDS = 'OST 背景 音乐 旋律 歌曲 调子 music 耳熟 BGM 谁唱的 来自 出自 原声'.split(' ')

_VIDEO_ID_RE = re.compile(r'/id_(.*?)\.html')
_TITLE_RE = re.compile(r'<title>(.*?)</title>')
_COMMENT_RE = re.compile(r'<p id=".*?">.*?</p>', flags=re.S)
_TAG_RE = re.compile(r'<.*?>')


class Youku_comment:
    """Download all comments of one Youku video.

    Attributes:
        pid: video id extracted from the URL.
        aa: flat list with the text of every comment that was fetched.
    """

    def __init__(self, raw_url):
        """Fetch the video page and then every comment page.

        Args:
            raw_url: video page URL containing ``/id_<video id>.html``.

        Raises:
            ValueError: if no video id can be found in ``raw_url``.
            requests.RequestException: if the video page or the total
                count cannot be downloaded.
        """
        found = _VIDEO_ID_RE.search(raw_url)
        if found is None:
            raise ValueError('no video id found in URL: %r' % (raw_url,))
        self.pid = found.group(1)

        page = requests.get(raw_url, timeout=PAGE_TIMEOUT)
        titles = _TITLE_RE.findall(page.text)
        # The title is only displayed, so a page without one is not fatal.
        title = titles[0] if titles else ''
        title = re.sub(r'\W', '', title).replace('在线播放优酷网视频高清在线观看', '')
        totalpn = self.get_totalpn(self.pid)
        print('视频ID:%s' % self.pid, '\n视频标题:%s' %
              title, '\n总页码数:%s\n正在抓取...' % totalpn)

        pool = Pool(WORKERS)
        try:
            pages = pool.map(self.get_comment, range(1, totalpn + 1))
        finally:
            # Always release the worker threads, even if a download raised.
            pool.close()
            pool.join()
        # Pages that could not be fetched are empty and contribute nothing.
        self.aa = list(itertools.chain.from_iterable(p for p in pages if p))

    def get_totalpn(self, pid):
        """Return the number of comment pages to request for video ``pid``."""
        r = requests.get(COMMENT_URL % (pid, 1), timeout=PAGE_TIMEOUT)
        # totalSize is handled as text with thousands separators, e.g. "1,234".
        total = int(str(r.json()['totalSize']).replace(',', ''))
        return total // PAGE_SIZE + 1

    def get_comment(self, pagenum):
        """Return the comment texts of one page.

        The request is attempted up to ``RETRIES`` times; a failed request,
        an unparsable answer or a page without comments triggers a retry.

        Args:
            pagenum: 1-based page number.

        Returns:
            List of comment strings with markup removed, or an empty list
            when every attempt failed.
        """
        url = COMMENT_URL % (self.pid, pagenum)
        for _ in range(RETRIES):
            try:
                html = requests.get(url, timeout=COMMENT_TIMEOUT).json()['con']
                paragraphs = _COMMENT_RE.findall(html)
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Network error, invalid JSON or unexpected payload: retry.
                continue
            comments = [_TAG_RE.sub('', p) for p in paragraphs]
            if comments:
                return comments
        return []


def filt1(str1, kws):
    """Return ``str1`` if it contains one of the keywords, else ``None``.

    Args:
        str1: comment text to test.
        kws: whitespace-separated keywords; any falsy value (``None``,
            ``''``, ``0``) or a blank string selects ``DEFAULT_KEYWORDS``.
    """
    # split() without argument ignores repeated/trailing spaces; an empty
    # keyword would otherwise be "contained" in every comment.
    keywords = (kws.split() if kws else []) or DEFAULT_KEYWORDS
    if any(keyword in str1 for keyword in keywords):
        return str1
    return None


def quchong(ll):
    """Join the distinct items of ``ll`` into one string, keeping order.

    Every kept item is preceded by a newline, so a non-empty result starts
    with ``'\\n'``.  Empty items are skipped.

    Args:
        ll: iterable of strings.

    Returns:
        The de-duplicated items, each prefixed with a newline.
    """
    seen = set()
    lines = []
    for item in ll:
        if not item or item in seen:
            continue
        seen.add(item)
        lines.append('\n' + item)
    return ''.join(lines)


def main():
    """Run the interactive prompt loop until an empty URL is entered."""
    while True:
        try:
            url = pyautogui.prompt('请输入网址:')
            if not url:
                break
            pinglun = Youku_comment(url).aa
            while True:
                kws = pyautogui.prompt('请输入关键词,多个请用空格隔开(直接回车则代表找背景音乐):')
                matched = [c for c in pinglun if filt1(c, kws)]
                print('检索结果:\n')
                print(quchong(matched))
                jixu = pyautogui.confirm(
                    text='是否要继续检索', title='请确认', buttons=['是', '否'])
                if jixu == '否':
                    break
        except Exception as e:
            # Top-level boundary: report the problem and ask for a new URL.
            print(e)
            print('错误,请重试')
            # NOTE(review): the original indentation was lost; the pause is
            # assumed to belong to the error path so the message can be read.
            os.system('pause')


if __name__ == '__main__':
    main()
Windows已编译可执行文件: http://pan.baidu.com/s/1bn0jLmf
Python练习:优酷评论过滤(抓取当前视频全部评论,并过滤不包括所需关键词的留言)
标签:
原文地址:http://www.cnblogs.com/pyld/p/4732311.html