码迷,mamicode.com
首页 > 其他好文 > 详细

爬取梨视频主页所有视频

时间:2020-01-02 20:58:12      阅读:111      评论:0      收藏:0      [点我收藏+]

标签:__name__   idt   lin   index   uid   executor   for   top   exec   

import requests
import re
import uuid
from concurrent.futures import ThreadPoolExecutor

# Shared worker pool (at most 50 threads) used for page fetches and downloads.
pool = ThreadPoolExecutor(max_workers=50)


# 爬虫三部曲
# 1.发送请求
def get_html(url, timeout=10):
    """Send an HTTP GET request to *url* and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would pin
            one worker thread of the pool for the rest of the run.

    Returns:
        The ``requests.Response`` produced by the request.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    print(f'start: {url}...')
    return requests.get(url, timeout=timeout)

# 2.解析数据
# 解析主页,获取视频详情页url
def parse_index(response):
    """Extract the unique video ids linked from the home page.

    The page text is searched for anchors of the form
    ``<a href="video_1637397"``; the part after ``video_`` is the id.

    Args:
        response: Object exposing the page HTML as ``response.text``.

    Returns:
        A list of id strings without duplicates, in the order in which they
        first appear in the page.
    """
    video_ids = re.findall(
        r'<a href="video_(.*?)"',  # e.g. video_1637397 -> "1637397"
        response.text,
        re.S,
    )
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is deterministic (a set would yield an arbitrary order per run).
    return list(dict.fromkeys(video_ids))


# from concurrent.futures._base import Future
# 解析视频详情页,获取真实视频url
def parse_detail(res):
    """Find the real video URL in a detail page and queue its download.

    Intended to receive the future of a ``get_html`` task (for example as an
    ``add_done_callback`` callback).

    Args:
        res: A completed ``concurrent.futures.Future`` whose result is the
            response for a video detail page (exposes the HTML as ``.text``).

    Returns:
        None. On success a ``save_movie`` task is submitted to ``pool``; when
        the page could not be fetched or holds no ``srcUrl`` the page is
        skipped with a message instead of raising.
    """
    try:
        # result() re-raises whatever exception the fetch task ended with.
        response = res.result()
    except Exception as exc:
        print(f'detail page request failed: {exc!r}')
        return

    # The page text is searched for srcUrl="..."; the first match is the
    # address of the actual video file.
    match = re.search('srcUrl="(.*?)"', response.text, re.S)
    if match is None:
        print('srcUrl not found in detail page, skipping')
        return
    movie_url = match.group(1)

    print('是否到此处了')
    # Submit the download asynchronously so it runs on a pool worker.
    pool.submit(save_movie, movie_url)


# 3.保存数据
def save_movie(movie_url, chunk_size=64 * 1024):
    """Download the video at *movie_url* into a new ``.mp4`` file.

    The file is created in the current working directory and named with a
    random UUID.

    Args:
        movie_url: Direct URL of the video file.
        chunk_size: Number of bytes written per iteration. ``iter_content``
            defaults to 1-byte chunks, which makes writing a video
            needlessly slow.

    Returns:
        None.
    """
    print('start')
    movie_response = get_html(movie_url)

    # Do not store an HTTP error body under a .mp4 name.
    if not movie_response.ok:
        print(f'download failed ({movie_response.status_code}): {movie_url}')
        return

    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in movie_response.iter_content(chunk_size=chunk_size):
            f.write(chunk)

    print('end...')

if __name__ == '__main__':
    import time
    from concurrent.futures import as_completed

    index_url = 'https://www.pearvideo.com/'
    response = get_html(index_url)
    # 1. Parse the home page and collect the ids of all linked videos.
    movie_id_list = parse_index(response)

    # 2. Submit one fetch task per detail page, pausing briefly between
    #    submissions.
    detail_futures = []
    for movie_id in movie_id_list:
        detail_url = 'https://www.pearvideo.com/video_' + movie_id
        time.sleep(0.1)
        detail_futures.append(pool.submit(get_html, detail_url))

    # 3. Handle each page as soon as its fetch completes. This runs in the
    #    main thread rather than as a done-callback: parse_detail submits the
    #    download to the pool, and a callback firing after the main thread
    #    has finished fails with "cannot schedule new futures after
    #    interpreter shutdown", silently dropping that video.
    for future in as_completed(detail_futures):
        try:
            parse_detail(future)
        except Exception as exc:
            # One bad page must not abort the remaining downloads.
            print(f'skip detail page: {exc!r}')

    # 4. Block until every queued download has been written to disk.
    pool.shutdown(wait=True)

爬取梨视频主页所有视频

标签:__name__   idt   lin   index   uid   executor   for   top   exec   

原文地址:https://www.cnblogs.com/chanyuli/p/12135616.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!