标签:tin values spi safari creat 基于 gen creating accesslog
代码(不包含蜘蛛):
#!/usr/bin/env python
# coding=utf-8
# File: top_10_useragent.py
"""Report the most frequent user agents found in an nginx access log.

Run as an mrjob script, e.g.::

    python3 top_10_useragent.py access_all.log-20161227
"""
import heapq

from mrjob.job import MRJob
from mrjob.step import MRStep
from nginx_accesslog_parser import NginxLineParser


class UserAgent(MRJob):
    """Two-step job: count hits per user agent, then keep the largest."""

    # Number of rows emitted by the final step.
    TOP_N = 10

    # A single parser instance is reused for every input line;
    # ``parse()`` is called on it before each field read.
    nginx_line_parser = NginxLineParser()

    def mapper(self, _, line):
        """Emit ``(user_agent, 1)`` for one raw access-log line.

        Lines for which the parser reports no ``http_user_agent``
        (``None``) are dropped.
        """
        self.nginx_line_parser.parse(line)
        user_agent = self.nginx_line_parser.http_user_agent
        if user_agent is not None:
            yield user_agent, 1

    def reducer_sum(self, key, values):
        """Sum the counts for one user agent.

        The result is emitted under the single key ``None`` so that every
        ``(total, user_agent)`` pair reaches the same reducer call in the
        next step. The total comes first so pairs order by count.
        """
        yield None, (sum(values), key)

    def reducer_top100(self, _, values):
        """Emit the ``TOP_N`` largest ``(count, user_agent)`` pairs.

        NOTE: despite the historical method name, this yields ``TOP_N``
        (10) rows, not 100. ``heapq.nlargest`` returns them in descending
        order without sorting the whole input.
        """
        for count, user_agent in heapq.nlargest(self.TOP_N, values):
            yield count, user_agent

    def steps(self):
        """Return the pipeline: count per key, then global top-N."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
            MRStep(reducer=self.reducer_top100),
        ]


def main():
    """Entry point: let mrjob parse the command line and run the job."""
    UserAgent.run()


if __name__ == '__main__':
    main()
结果:
# python3 top_10_useragent.py access_all.log-20161227
No configs found; falling back on auto-configuration
Creating temp directory /tmp/top_10_useragent.root.20161228.090725.308144
Running step 1 of 2...
Running step 2 of 2...
Streaming final output from /tmp/top_10_useragent.root.20161228.090725.308144/output...
85262 "IE"
79611 "Chrome"
48560 "Other"
10662 "Firefox"
7927 "Mobile Safari UI/WKWebView"
7182 "Sogou Explorer"
6681 "QQ Browser"
1988 "Mobile Safari"
1781 "Maxthon"
1404 "Edge"
Removing temp directory /tmp/top_10_useragent.root.20161228.090725.308144...
#!/usr/bin/env python
# coding=utf-8
"""Report the most frequent user-agent types found in an nginx access log.

Run as an mrjob script, e.g.::

    python3 top_10_spider.py access_all.log-20161227
"""
import heapq

from mrjob.job import MRJob
from mrjob.step import MRStep
from nginx_accesslog_parser import NginxLineParser


class Spider(MRJob):
    """Two-step job: count hits per user-agent type, then keep the largest."""

    # Number of rows emitted by the final step.
    TOP_N = 10

    # A single parser instance is reused for every input line;
    # ``parse()`` is called on it before each field read.
    nginx_line_parser = NginxLineParser()

    def mapper(self, _, line):
        """Emit ``(user_agent_type, 1)`` for one raw access-log line.

        Lines for which the parser reports no ``user_agent_type``
        (``None``) are dropped.
        NOTE(review): presumably ``user_agent_type`` is only set for
        crawler traffic -- confirm against NginxLineParser.
        """
        self.nginx_line_parser.parse(line)
        agent_type = self.nginx_line_parser.user_agent_type
        if agent_type is not None:
            yield agent_type, 1

    def reducer_sum(self, key, values):
        """Sum the counts for one user-agent type.

        The result is emitted under the single key ``None`` so that every
        ``(total, agent_type)`` pair reaches the same reducer call in the
        next step. The total comes first so pairs order by count.
        """
        yield None, (sum(values), key)

    def reducer_top100(self, _, values):
        """Emit the ``TOP_N`` largest ``(count, agent_type)`` pairs.

        NOTE: despite the historical method name, this yields ``TOP_N``
        (10) rows, not 100. ``heapq.nlargest`` returns them in descending
        order without sorting the whole input.
        """
        for count, agent_type in heapq.nlargest(self.TOP_N, values):
            yield count, agent_type

    def steps(self):
        """Return the pipeline: count per key, then global top-N."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
            MRStep(reducer=self.reducer_top100),
        ]


def main():
    """Entry point: let mrjob parse the command line and run the job."""
    Spider.run()


if __name__ == '__main__':
    main()
执行结果:
# python3 top_10_spider.py access_all.log-20161227
No configs found; falling back on auto-configuration
Creating temp directory /tmp/top_10_spider.root.20161228.091326.295972
Running step 1 of 2...
Running step 2 of 2...
Streaming final output from /tmp/top_10_spider.root.20161228.091326.295972/output...
33542 "magpie-crawler"
25880 "Other"
16578 "Sogou web spider"
6383 "bingbot"
3688 "Baiduspider"
1487 "Yahoo! Slurp"
1096 "JikeSpider"
731 "YisouSpider"
648 "Baiduspider-image"
470 "Googlebot"
Removing temp directory /tmp/top_10_spider.root.20161228.091326.295972...
五、基于hadoop的nginx访问日志分析--userAgent和spider
标签:tin values spi safari creat 基于 gen creating accesslog
原文地址:http://www.cnblogs.com/xiaoming279/p/6230237.html