Python搜索B站视频并且爬取

时间：2021-01-20 12:02:16 阅读：0 评论：0 收藏：0 [点我收藏+]

标签：安装 ica content down video webp iter div encoding

目前遍历循环尚未完成，因此只会下载搜索结果中的第一个视频；后续会完善成可单独调用的接口。下载完成后还需要合并音频和视频，运行前请先安装并配置好 ffmpeg 环境。

# -*- coding: utf-8 -*-

import requests
from urllib import parse,request
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import subprocess
import time
import json
import sys
import io
import ffmpeg

# Switch standard output to UTF-8 (presumably so the Chinese titles printed
# below do not fail on consoles with a non-UTF-8 default encoding).
# Prefer reconfiguring the existing stream in place; fall back to wrapping the
# underlying byte buffer when the stream has no ``reconfigure`` method.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class BiliBili(object):
    """Search Bilibili and download the first video of the result page.

    The workflow is: fetch the search page stored in ``self.url``, take the
    first result, read the ``window.__playinfo__`` JSON embedded in the video
    page, download the first video and audio streams listed there, and merge
    the two files with ffmpeg.
    """

    # Browser-like headers used for every HTML page request.
    PAGE_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }

    # Seconds to wait for a connection / for the next bytes of a response.
    REQUEST_TIMEOUT = 30

    # Preferred ffmpeg executable; when this file does not exist the plain
    # command name ``ffmpeg`` is used and resolved through PATH.
    FFMPEG_PATH = r"D:\sofware\ffmpeg-4.3.1-2021-01-01-full_build\bin\ffmpeg.exe"

    # Captures the JSON assigned to window.__playinfo__ in a video page.
    _PLAYINFO_RE = re.compile(
        r'<script>window\.__playinfo__=(.*?)</script>', re.S)

    def __init__(self, url):
        """Store the search-page URL used by :meth:`run_search`."""
        self.url = url

    def html(self, url):
        """Return the body text of ``url`` fetched with browser-like headers."""
        response = requests.get(
            url, headers=self.PAGE_HEADERS, timeout=self.REQUEST_TIMEOUT)
        return response.text

    def get_video_html(self, url):
        """Return the HTML of a video page (same request as :meth:`html`)."""
        return self.html(url)

    @classmethod
    def _parse_playinfo(cls, html):
        """Return the decoded ``window.__playinfo__`` JSON found in ``html``.

        Raises:
            IndexError: if the page contains no ``window.__playinfo__`` script
                (the same exception type the unguarded ``findall(...)[0]``
                produced, now with an explanatory message).
        """
        match = cls._PLAYINFO_RE.search(html)
        if match is None:
            raise IndexError('window.__playinfo__ not found in the page')
        return json.loads(match.group(1))

    def get_video_info(self, html):
        """Return the first backup URL of the first DASH video stream."""
        playinfo = self._parse_playinfo(html)
        return playinfo['data']['dash']['video'][0]['backup_url'][0]

    def get_audio_info(self, html):
        """Return the first backup URL of the first DASH audio stream."""
        playinfo = self._parse_playinfo(html)
        return playinfo['data']['dash']['audio'][0]['backup_url'][0]

    def search_video_info(self, html):
        """Return ``(title, video_url)`` of the first usable search result.

        ``video_url`` is returned without scheme (e.g. ``www.bilibili.com/...``)
        because :meth:`run_video` prepends ``https://`` itself.

        Returns:
            A ``(title, video_url)`` tuple, or ``None`` when the page has no
            result block containing both a title link and an href.
        """
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all('div', class_='info'):
            title_tag = tag.find('a', class_='title')
            link_tag = tag.find('a')
            href = link_tag.get('href') if link_tag is not None else None
            if title_tag is None or not href:
                # Not a complete result entry; try the next one.
                continue
            # Drop only the leading "//" (or "http(s)://"), not every "//"
            # that may appear later in the link.
            video_url = re.sub(r'^(?:https?:)?//', '', href)
            return title_tag.get_text(), video_url
        return None

    def search_video(self, html):
        """Pick the first search result from ``html`` and download it."""
        result = self.search_video_info(html)
        if result is None:
            print('未找到任何搜索结果')
            return
        title, video_url = result
        print(title)
        print(video_url)
        print(self.url)
        self.run_video(title, video_url, self.url)

    def run_search(self):
        """Fetch the search page at ``self.url`` and download its first result."""
        # Fetch the search results, then resolve the video link from them.
        html = self.html(self.url)
        self.search_video(html)

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` with characters that are invalid in file names replaced."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip()
        return cleaned or 'video'

    def _download(self, stream_url, path, headers):
        """Stream ``stream_url`` into the file ``path``; return True on HTTP 200."""
        with requests.get(stream_url, stream=True, headers=headers,
                          timeout=self.REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print('下载失败，状态码：', response.status_code)
                return False
            # The size is informational only, so a missing header is not fatal.
            size = response.headers.get('content-length')
            if size is not None:
                print('[文件大小]:%0.2f MB' % (int(size) / 1024 / 1024))
            with open(path, mode='wb') as out:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        out.write(chunk)
        return True

    def run_video(self, title, video_url, url):
        """Download the video and audio streams of one video and merge them.

        Args:
            title: Video title; used (sanitised) as the base file name.
            video_url: Video page address without the ``https://`` prefix.
            url: Search URL the video came from; only printed.
        """
        # Resolve the download links from the video page.
        print("视频名称：" + title)
        print(url)
        page_url = 'https://' + video_url
        print(page_url)
        video_page = self.get_video_html(page_url)
        download_video_url = self.get_video_info(video_page)
        download_audio_url = self.get_audio_info(video_page)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': page_url,
            'Accept-Encoding': "gzip, deflate, br",
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }
        base_name = self._safe_filename(title)
        # Attempt both downloads, as before, even if the first one fails.
        video_ok = self._download(download_video_url, base_name + '.mp4', headers)
        audio_ok = self._download(download_audio_url, base_name + '.mp3', headers)
        if not (video_ok and audio_ok):
            # Merging needs both inputs; skip it instead of running ffmpeg on
            # missing or stale files.
            print('下载未完成，跳过合成：', title)
            return

        print('正在保存：', title)
        self.video_audio_merge_single(base_name)

    def video_audio_merge_single(self, video_name):
        """Merge ``<video_name>.mp4`` and ``<video_name>.mp3`` with ffmpeg.

        The streams are copied without re-encoding into
        ``<video_name>(合).mp4``. The call blocks until ffmpeg exits; a missing
        executable or a non-zero exit status is reported, not raised.
        """
        print('视频合成开始：', video_name)
        ffmpeg_exe = self.FFMPEG_PATH if os.path.isfile(self.FFMPEG_PATH) else 'ffmpeg'
        # Argument list (no shell): titles containing quotes or shell
        # metacharacters cannot break or alter the command.
        # "-y" overwrites an existing output file so ffmpeg never stops to ask.
        command = [
            ffmpeg_exe, '-y',
            '-i', video_name + '.mp4',
            '-i', video_name + '.mp3',
            '-vcodec', 'copy',
            '-acodec', 'copy',
            video_name + '(合)' + '.mp4',
        ]
        print(subprocess.list2cmdline(command))
        try:
            completed = subprocess.run(command)
        except OSError as exc:
            print('视频合成失败：', exc)
            return
        if completed.returncode != 0:
            print('视频合成失败，ffmpeg 返回码：', completed.returncode)
            return
        print("视频合成结束：", video_name)


if __name__ == '__main__':
    # Build the Bilibili search URL for the keyword and download the first hit.
    # ``url`` is deliberately a module-level name here.
    url = 'https://search.bilibili.com/all?'
    keyword = '哈哈哈哈哈'  # title of the video to search for
    # Percent-encode the keyword so non-ASCII text is valid in the query string.
    keyword = urllib.parse.quote(keyword)
    param = 'keyword=' + keyword + '&from_source=nav_searchs&pm_id_from=333.851.b_696e7465726e6174696f6e616c486561646572.15'
    url = url + param
    BB = BiliBili(url)
    BB.run_search()

Python搜索B站视频并且爬取

标签：安装 ica content down video webp iter div encoding

原文地址：https://www.cnblogs.com/duanminkid/p/14300350.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行