While learning Python coroutines, I combined them with generator functions to implement a deep crawl of Sina News. With a deep crawl, newly discovered URLs can be requested as soon as they are produced, rather than waiting for the whole URL list to be collected first. A minimal sketch of the underlying pattern comes first, then the full crawler code.
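The pattern the crawler relies on is an async generator: a coroutine that yields values, which the caller consumes with async for, so each newly found URL can be handled the moment it appears. The function name discover_urls and the example.com addresses below are made up purely for illustration and are not part of the crawler.

import asyncio

async def discover_urls():
    # Pretend each iteration discovers one more URL from a search page
    for i in range(3):
        await asyncio.sleep(0.1)  # stands in for a real network request
        yield f'https://example.com/news/{i}'

async def main():
    async for url in discover_urls():  # handle every URL as soon as it is yielded
        print('fetch', url)

asyncio.run(main())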
import aiohttp
from lxml import etree
import csv
import asyncio
import os
from loguru import logger


class Sina(object):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    model_url = 'https://search.sina.com.cn/?q={}&c=news&from=&col=&range=all&source=&country=&size=10&stime=&etime=&time=&dpc=0&a=&ps=0&pf=0&page=1'

    def __init__(self):
        # One line of comma-separated search keywords read from a local file
        self.params_list = open(r'C:\Users\360技嘉电竞\Desktop\sina.txt', encoding='utf-8').readline().split(',')

    async def fetch(self, url, sem):
        # Async generator: request a search-results page and yield each article URL it contains
        async with sem:
            infos = None
            try:
                async with aiohttp.ClientSession(headers=self.headers) as session:
                    async with session.get(url) as res:
                        html = await res.text()
                infos = etree.HTML(html)
                print(infos.xpath('//*[@id="result"]/div/h2/a/@href'))
                for new_url in infos.xpath('//*[@id="result"]/div/h2/a/@href'):
                    yield new_url
            except Exception as e:
                logger.debug(e)
                print("发生异常")  # an exception occurred
                if infos is not None:
                    # Print the notice text on the page (e.g. an anti-crawling prompt)
                    print(infos.xpath('/html/body/p[6]/text()'))

    async def crawl(self, url, sem):
        # Consume the async generator: as soon as an article URL is yielded, request that page
        htmls = []
        async for new_url in self.fetch(url, sem):
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(new_url) as res:
                    htmls.append(await res.text())
        return htmls

    def callback(self, task):
        print(task)

    async def main(self):
        tasks = list()
        sem = asyncio.Semaphore(10)  # cap the number of concurrent requests at 10
        for word in self.params_list:
            task = asyncio.ensure_future(self.crawl(self.model_url.format(word), sem))
            task.add_done_callback(self.callback)
            tasks.append(task)
        await asyncio.wait(tasks)

    def run(self):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.main())


s = Sina()
s.run()
Original article: https://www.cnblogs.com/sacreddoll/p/14867934.html