码迷,mamicode.com
首页 > 其他好文 > 详细

【12.8】asyncio高并发爬虫

时间:2019-08-24 18:44:06      阅读:79      评论:0      收藏:0      [点我收藏+]

标签:glob   insert   ems   exec   for   lob   future   session   asyncio   

  1 #!/usr/bin/env python
  2 # -*- coding:utf-8 -*-
  3 # asyncio爬虫、去重、入库
  4 
  5 import asyncio
  6 import re
  7 
  8 import aiohttp
  9 import aiomysql
 10 from pyquery import PyQuery
 11 from aiohttp import TCPConnector
 12 
 13 
 14 start_url = http://www.xbiquge.la/paihangbang/
 15 waitting_urls = []
 16 seen_urls = set()
 17 stopping = False
 18 
 19 sem = asyncio.Semaphore(3)
 20 
 21 
 22 async def fetch(url, session):
 23     async with sem:
 24         try:
 25             async with session.get(url) as resp:
 26                 print(url status: {}.format(resp.status))
 27                 if resp.status in [200, 201]:
 28                     data = await resp.text()
 29                     return data
 30         except Exception as e:
 31             print(e)
 32 
 33 
 34 def extract_urls(html):
 35     pq = PyQuery(html)
 36     for link in pq.items(a):
 37         url = link.attr(href)
 38         if url and url.startswith(http) and url not in seen_urls:
 39             global waitting_urls
 40             waitting_urls.append(url)
 41 
 42 
 43 async def init_urls(url, session):
 44     html = await fetch(url, session)
 45     seen_urls.add(url)
 46     extract_urls(html)
 47 
 48 
 49 async def article_handle(url, session, pool):
 50     # 获取文章详情并解析入库
 51     html = await fetch(url, session)
 52     seen_urls.add(url)
 53     extract_urls(html)
 54     pq = PyQuery(html)
 55     title = pq("title").text()
 56     # title = title + ‘\n‘
 57     async with pool.acquire() as conn:
 58         async with conn.cursor() as cur:
 59             insert_sql = "INSERT INTO article_test VALUES(‘{}‘)".format(title)
 60             print(insert_sql)
 61             await cur.execute(insert_sql)
 62     # 文件操作
 63     # with open(‘aiohttp_spider.txt‘, mode=‘a‘, encoding=‘utf-8‘) as file_object:
 64     #     file_object.write(title)
 65 
 66 
 67 async def consumer(pool, session):
 68     while not stopping:
 69         if len(waitting_urls) == 0:
 70                 await asyncio.sleep(0.5)
 71                 continue
 72 
 73         url = waitting_urls.pop()
 74         print(start get url: {}.format(url))
 75         if re.match(http://www.xbiquge.la/\d+/\d+/, url):
 76             if url not in seen_urls:
 77                 asyncio.ensure_future(article_handle(url, session, pool))
 78         else:
 79             if url not in seen_urls:
 80                 asyncio.ensure_future(init_urls(url, session))
 81 
 82 
 83 async def main(loop):
 84     # 等待mysql连接建立好
 85     pool = await aiomysql.create_pool(host=127.0.0.1, port=3306,
 86                                       user=root, password=123456,
 87                                       db=aiomysql_test, loop=loop,
 88                                       charset=utf8, autocommit=True)
 89 
 90     session = aiohttp.ClientSession()
 91     html = await fetch(start_url, session)
 92     seen_urls.add(start_url)
 93     extract_urls(html)
 94 
 95     asyncio.ensure_future(consumer(pool, session))
 96 
 97 
 98 if __name__ == __main__:
 99     loop = asyncio.get_event_loop()
100     asyncio.ensure_future(main(loop))
101     loop.run_forever()

 

【12.8】asyncio高并发爬虫

标签:glob   insert   ems   exec   for   lob   future   session   asyncio   

原文地址:https://www.cnblogs.com/zydeboke/p/11405336.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!