标签:fetch res strip next set rom syn 消费者 with
"""Minimal concurrent web crawler built on Tornado coroutines and queues.

Starting from ``base_url``, ``concurrency`` worker coroutines pull URLs from
a shared queue, download each page, extract its ``<a href>`` links and push
them back onto the queue until no work is left.
"""
from urllib.parse import urljoin

from pyquery import PyQuery as pq
from tornado import gen, httpclient, ioloop, queues

# Seed URL of the crawl.
base_url = "http://www.baidu.com"
# Number of worker coroutines consuming the queue (and of shutdown sentinels).
concurrency = 8


async def get_url_links(url):
    """Download *url* and return the absolute URLs of all links on the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with one absolute URL per ``<a>`` element that carries a
        non-empty ``href`` attribute, in document order.

    Raises:
        Whatever ``AsyncHTTPClient.fetch`` raises for a failed request, and
        ``UnicodeDecodeError`` when the body is not valid UTF-8.
    """
    response = await httpclient.AsyncHTTPClient().fetch(url)
    html = response.body.decode("utf-8")
    document = pq(html)
    # Relative hrefs must be resolved against the page they were found on
    # (after redirects), not against the crawl's seed URL.
    page_url = response.effective_url or url
    links = []
    for anchor in document("a").items():
        href = anchor.attr("href")
        # Anchors without an href carry no target; skip them instead of
        # letting urljoin() turn them into a copy of the base URL.
        if href:
            links.append(urljoin(page_url, href))
    return links


async def main():
    """Crawl outward from ``base_url`` until the work queue is drained."""
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        """Fetch one URL (at most once) and enqueue every link it contains."""
        if current_url in seen_set:
            return

        print(f"获取:{current_url}")
        seen_set.add(current_url)
        next_urls = await get_url_links(current_url)
        for next_url in next_urls:
            # NOTE: links are not restricted to base_url, so the crawl follows
            # external sites too; filter with next_url.startswith(base_url)
            # here to stay on a single site.
            await q.put(next_url)

    async def worker():
        """Consume URLs from the queue until a ``None`` sentinel arrives."""
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                # Best effort: one failing page must not stop the worker.
                print(f"exception:{e}")
            finally:
                # Every get() increments the queue's unfinished-task counter;
                # balance it so that q.join() can eventually return.
                q.task_done()

    # Seed the queue with the start URL.
    await q.put(base_url)

    # Start the consumers; their number must match the number of sentinels
    # sent below so that every worker receives exactly one.
    workers = gen.multi([worker() for _ in range(concurrency)])

    # Blocks until every queued URL has been processed.
    await q.join()

    # Tell each worker to exit.
    for _ in range(concurrency):
        await q.put(None)

    # Wait for all worker coroutines to finish.
    await workers


if __name__ == "__main__":
    ioloop.IOLoop.current().run_sync(main)
标签:fetch res strip next set rom syn 消费者 with
原文地址:https://www.cnblogs.com/traditional/p/10134594.html