1. Tornado 实现高并发爬虫

时间：2018-12-18 02:34:01 阅读：219 评论：0 收藏：0 [点我收藏+]

标签：fetch res strip next set rom syn 消费者 with

from pyquery import PyQuery as pq
from tornado import ioloop, gen, httpclient, queues
from urllib.parse import urljoin


# Start URL: main() seeds the work queue with it, and get_url_links() passes
# it to urljoin() as the base when building absolute links.
base_url = "http://www.baidu.com"
# Read by main() as the number of None sentinels it enqueues once the queue
# has drained.
concurrency = 8


async def get_url_links(url):
    """Fetch *url* and return the absolute URLs of every link on the page.

    Args:
        url: Address of the page to download.

    Returns:
        A list of absolute URL strings, one per ``<a>`` element that carries
        a non-empty ``href`` attribute, in document order.

    Raises:
        Whatever ``AsyncHTTPClient.fetch`` raises for a failed request; the
        caller is expected to handle it.
    """
    response = await httpclient.AsyncHTTPClient().fetch(url)
    # Drop undecodable bytes instead of failing the whole page on one bad byte.
    html = response.body.decode("utf-8", errors="ignore")

    # Relative links must be resolved against the page they appear on (after
    # any redirects), not against the crawl's start URL.
    page_url = response.effective_url or url

    links = []
    # Iterate the matched anchors once instead of re-running the selector for
    # every index up to an arbitrary cap.
    for anchor in pq(html)("a").items():
        href = anchor.attr("href")
        # Anchors without an href (e.g. named anchors) are not links.
        if href:
            links.append(urljoin(page_url, href))
    return links


async def main():
    """Crawl outward from ``base_url`` with ``concurrency`` queue consumers.

    URLs are passed through a tornado queue: each worker takes a URL, fetches
    it once (duplicates are skipped via a shared seen-set), and enqueues every
    link found on the page. When the queue has been fully processed, one
    ``None`` sentinel per worker is enqueued so the workers exit.
    """
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        """Fetch *current_url* unless already seen, and enqueue its links."""
        if current_url in seen_set:
            return

        print(f"获取：{current_url}")
        seen_set.add(current_url)

        next_urls = await get_url_links(current_url)
        for next_url in next_urls:
            # NOTE: links are not restricted to base_url, so the crawl also
            # follows off-site links. Guard this put with
            # next_url.startswith(base_url) to confine it to one site.
            await q.put(next_url)

    async def worker():
        """Consume URLs from the queue until a ``None`` sentinel arrives."""
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                # Best effort: one failing page must not stop the worker.
                print(f"exception:{e}")
            finally:
                # Every put() increments the queue's unfinished-task counter;
                # task_done() decrements it so that q.join() can return.
                q.task_done()

    # Seed the queue with the start URL.
    await q.put(base_url)

    # Start the consumers. The count must match the number of sentinels
    # enqueued below, so both use ``concurrency``.
    workers = gen.multi([worker() for _ in range(concurrency)])

    # Blocks until every queued URL has been marked done.
    await q.join()

    # One sentinel per worker tells each of them to exit.
    for _ in range(concurrency):
        await q.put(None)

    # Wait for all workers to finish.
    await workers


if __name__ == "__main__":
    # Run main() on the current IOLoop; run_sync stops the loop when it ends.
    ioloop.IOLoop.current().run_sync(main)

1. Tornado 实现高并发爬虫

标签：fetch res strip next set rom syn 消费者 with

原文地址：https://www.cnblogs.com/traditional/p/10134594.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行