# coding: utf-8
"""Crawl the v.qq.com TV-series listing and record every show's subtype.

Each listing page is downloaded, the detail-page links and titles are
extracted, and one ``url,title,subtype`` row per show is appended to
``result.txt``.
"""

__autor__ = 'litao'

import logging
import random
import re
import time
import traceback

import requests
from lxml import etree

from logMaker import Logger

# Pool of browser User-Agent strings; one is picked at import time (see `headers`).
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# Evaluated once when the module is imported, so every request in a run
# carries the same randomly chosen User-Agent.
headers = {
    "User-Agent": random.choice(USER_AGENTS)
}

# Console handler at ERROR, file handler ('spider.log') at DEBUG.
log = Logger('spider.log', logging.ERROR, logging.DEBUG)

# Captures the text between the brackets of `subtype":[ ... ],` in a page source.
Type = re.compile(r'subtype":\[(.*?)\],')


def get_html(url, agent=True):
    """
    Download the page source behind ``url``.

    A failed first attempt is logged and retried exactly once; a failed
    retry yields ``None``. Every attempt is followed by a random 5-9 second
    pause to throttle the crawl.

    :param url: address of the page to download
    :param agent: True on the first attempt (a retry is still allowed),
                  False when this call is itself the retry
    :return: the response text, or None when both attempts failed
    """
    try:
        html = requests.get(url=url, headers=headers, timeout=10).text
        log.debug("url: %s请求成功!" % url)
    except Exception:
        if agent:
            log.error("url: %s访问出错,出错原因如下即将重试" % url)
            # format_exc() returns the traceback text; print_exc() returns None.
            log.error(traceback.format_exc())
            # Hand the retry's result back to the caller instead of dropping it.
            return get_html(url, agent=False)
        log.error("url: %s重试失败,放弃抓取" % url)
        html = None
    time.sleep(random.randint(5, 9))
    return html


def get_info(url_and_title_list):
    """
    Fetch every detail page and append its ``url,title,subtype`` row to
    ``result.txt`` (gbk encoded).

    Pages that cannot be downloaded are skipped, and an error while
    handling one item is logged without aborting the remaining items.

    :param url_and_title_list: iterable of (url, title) pairs to process
    :return: None
    """
    for url, title in url_and_title_list:
        try:
            html = get_html(url)
            if not html:
                # Download failed (None) or empty body: nothing to parse.
                continue
            selector = etree.HTML(html)
            print(selector.xpath("//a[@class='tag_item']/text()")[2:])
            found = Type.search(html)
            if found:
                subtype = found.group(1)
            else:
                log.error("%s电视剧分为为空!" % title)
                subtype = ''
            content = url + ',' + title + ',' + subtype
            print(content)
            with open('result.txt', 'a', encoding='gbk') as f:
                f.write(content + '\n')
            log.debug("向文件中写入: %s" % content)
        except Exception:
            log.error("获取信息过程中出错,出错原因如下")
            log.error(traceback.format_exc())


def get_all_url(page_count=117, page_size=30):
    """
    Walk the TV-series listing pages in random order and process each
    page's shows through ``get_info``.

    :param page_count: number of listing pages to visit (pages 0..page_count-1)
    :param page_size: offset step between consecutive listing pages
    :return: None
    """
    # sample() over the full range yields every page index once, shuffled.
    page_number_list = random.sample(range(page_count), page_count)
    for page_number in page_number_list:
        log.debug("开始爬去电视剧下第%d页内容" % page_number)
        url = 'http://v.qq.com/x/list/tv?sort=18&iarea=-1&offset={offset}'.format(offset=page_number * page_size)
        html = get_html(url)
        if not html:
            continue
        selector = etree.HTML(html)
        child_url_list = selector.xpath("//ul[@class='figures_list']/li/a/@href")
        name = selector.xpath("//strong[@class='figure_title']/a/@title")
        get_info(zip(child_url_list, name))


if __name__ == "__main__":
    get_all_url()
    # Same encoding as the rows written by get_info, so the file stays readable.
    with open('result.txt', 'a', encoding='gbk') as f:
        f.write('https://v.qq.com/x/cover/5tjct4561pq7zan.html,热剧精彩周边,"创意剪辑"\n')
# coding: utf-8

__autor__ = 'litao'

import logging


class Logger:
    """Thin wrapper around a named ``logging`` logger that writes to both
    the console and a file, each with its own level threshold."""

    def __init__(self, path, clevel=logging.DEBUG, Flevel=logging.DEBUG):
        """
        :param path: log file path; also used as the logger's name
        :param clevel: minimum level shown on the console handler
        :param Flevel: minimum level written to the file handler
        """
        self.logger = logging.getLogger(path)
        self.logger.setLevel(logging.DEBUG)
        # getLogger() returns the same object for the same name, so a second
        # Logger(path) would otherwise stack another pair of handlers and
        # every message would be emitted twice.
        if self.logger.handlers:
            return
        fmt = logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S')
        # Console logging
        sh = logging.StreamHandler()
        sh.setFormatter(fmt)
        sh.setLevel(clevel)
        # File logging
        fh = logging.FileHandler(path)
        fh.setFormatter(fmt)
        fh.setLevel(Flevel)
        self.logger.addHandler(sh)
        self.logger.addHandler(fh)

    def debug(self, message):
        """Log ``message`` at DEBUG level."""
        self.logger.debug(message)

    def info(self, message):
        """Log ``message`` at INFO level."""
        self.logger.info(message)

    def war(self, message):
        """Log ``message`` at WARNING level."""
        # Logger.warn() is a deprecated alias (removed in Python 3.13).
        self.logger.warning(message)

    def error(self, message):
        """Log ``message`` at ERROR level."""
        self.logger.error(message)

    def cri(self, message):
        """Log ``message`` at CRITICAL level."""
        self.logger.critical(message)


if __name__ == '__main__':
    logyyx = Logger('yyx.log', logging.ERROR, logging.DEBUG)
    logyyx.debug('一个debug信息')
    logyyx.info('一个info信息')
    logyyx.war('一个warning信息')
    logyyx.error('一个error信息')
    logyyx.cri('一个致命critical信息')
# coding = utf-8
__autor__ = ‘litao‘
import random, requests
import logging
import traceback
import time
import re
from lxml import etree
from logMaker import Logger
# Pool of browser User-Agent strings; one is picked at import time (see `headers`).
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# Evaluated once when the module is imported, so every request in a run
# carries the same randomly chosen User-Agent.
headers = {
    "User-Agent": random.choice(USER_AGENTS)
}

# Logger(path, clevel, Flevel): console handler at ERROR, file handler at DEBUG.
log = Logger('spider.log', logging.ERROR, logging.DEBUG)

# Captures the text between the brackets of `subtype":[ ... ],` in a page source.
Type = re.compile(r'subtype":\[(.*?)\],')
def get_html(url, agent=True):
    """
    Download the page source behind ``url``.

    A failed first attempt is logged and retried exactly once; a failed
    retry yields ``None``. Every attempt is followed by a random 5-9 second
    pause to throttle the crawl.

    :param url: address of the page to download
    :param agent: True on the first attempt (a retry is still allowed),
                  False when this call is itself the retry
    :return: the response text, or None when both attempts failed
    """
    try:
        html = requests.get(url=url, headers=headers, timeout=10).text
        log.debug("url: %s请求成功!" % url)
    except Exception:
        if agent:
            log.error("url: %s访问出错,出错原因如下即将重试" % url)
            # format_exc() returns the traceback text; print_exc() returns
            # None, which is what used to end up in the log.
            log.error(traceback.format_exc())
            # Hand the retry's result back to the caller. Previously it was
            # discarded and `html` stayed unbound on this path.
            return get_html(url, agent=False)
        log.error("url: %s重试失败,放弃抓取" % url)
        html = None
    time.sleep(random.randint(5, 9))
    return html
def get_info(url_and_title_list):
    """
    Fetch every detail page and append its ``url,title,subtype`` row to
    ``result.txt`` (gbk encoded).

    Pages that cannot be downloaded are skipped, and an error while
    handling one item is logged without aborting the remaining items.

    :param url_and_title_list: iterable of (url, title) pairs to process
    :return: None
    """
    for url, title in url_and_title_list:
        # The try is per item so that one bad page does not end the batch.
        try:
            html = get_html(url)
            if not html:
                # Download failed (None) or empty body: nothing to parse.
                continue
            selector = etree.HTML(html)
            # XPath string literals need ASCII quotes.
            print(selector.xpath("//a[@class='tag_item']/text()")[2:])
            found = Type.search(html)
            if found:
                subtype = found.group(1)
            else:
                log.error("%s电视剧分为为空!" % title)
                subtype = ''
            content = url + ',' + title + ',' + subtype
            print(content)
            with open('result.txt', 'a', encoding='gbk') as f:
                f.write(content + '\n')
            log.debug("向文件中写入: %s" % content)
        except Exception:
            log.error("获取信息过程中出错,出错原因如下")
            # format_exc() returns the traceback text; print_exc() returns None.
            log.error(traceback.format_exc())
def get_all_url(page_count=117, page_size=30):
    """
    Walk the TV-series listing pages in random order and process each
    page's shows through ``get_info``.

    :param page_count: number of listing pages to visit (pages 0..page_count-1)
    :param page_size: offset step between consecutive listing pages
    :return: None
    """
    # sample() over the full range yields every page index once, shuffled.
    page_number_list = random.sample(range(page_count), page_count)
    for page_number in page_number_list:
        log.debug("开始爬去电视剧下第%d页内容" % page_number)
        url = 'http://v.qq.com/x/list/tv?sort=18&iarea=-1&offset={offset}'.format(offset=page_number * page_size)
        html = get_html(url)
        if html is None:
            # Listing page could not be downloaded; move on to the next one.
            continue
        selector = etree.HTML(html)
        child_url_list = selector.xpath("//ul[@class='figures_list']/li/a/@href")
        name = selector.xpath("//strong[@class='figure_title']/a/@title")
        get_info(zip(child_url_list, name))
if __name__ == "__main__":
    # Crawl everything, then append one fixed extra row to the result file.
    get_all_url()
    # Use gbk like the rows written by get_info: mixing utf-8 into a gbk file
    # leaves this line unreadable. The newline keeps a later run from
    # appending onto the same line.
    with open('result.txt', 'a', encoding='gbk') as f:
        f.write('https://v.qq.com/x/cover/5tjct4561pq7zan.html,热剧精彩周边,"创意剪辑"\n')