异步&线程池&防盗链爬取梨视频热门页视频

梨视频热门页
import requests
from lxml import etree
import random
import re
from multiprocessing.dummy import Pool
import time
import aiohttp
import asyncio
import aiofiles

url = "https://www.pearvideo.com/popular"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

# Fetch the popular page. requests applies no timeout by default, so set one
# to avoid hanging forever on a stalled connection, and fail fast on an HTTP
# error instead of silently parsing an error page into empty lists.
resp = requests.get(url=url, headers=headers, timeout=10)
resp.raise_for_status()
tree = etree.HTML(resp.text)

# Every entry of the popular list is an <a> whose href has the form
# "<prefix>_<id>" and which wraps an <h2> holding the video title.
_entry_xpath = '//ul[@class="popular-list"]//div[@class="popularem-ath"]/a'
_hrefs = tree.xpath(_entry_xpath + "/@href")
_titles = tree.xpath(_entry_xpath + "/h2/text()")

# The video id is the part of the href after the underscore.
contIds = [href.split("_")[1] for href in _hrefs]
# Titles become file names later on, so replace the characters that are path
# separators or are rejected by common file systems before appending ".mp4".
names = [re.sub(r'[\\/:*?"<>|]', "_", title) + ".mp4" for title in _titles]

start_time = time.time()
# Download jobs: one {"url": <real video address>, "name": <file name>} dict
# per video. NOTE: this rebinds the module-level name `url`, which held the
# page address until now; the download functions below read this list.
url = []
# The <video> tag is loaded dynamically, so the address has to be requested
# from this AJAX endpoint (found by inspecting the page's network traffic).
ajax_url = "https://www.pearvideo.com/videoStatus.jsp?"
# Only the first three videos of the popular page are downloaded. Slicing and
# zipping (instead of indexing 0..2) keeps this working when the page lists
# fewer than three entries.
for contId, name in zip(contIds[:3], names[:3]):
    # The request carries the video id plus a random number.
    param = {"contId": contId, "mrd": str(random.random())}
    # Anti-hotlinking: the endpoint rejects requests without a Referer that
    # points at the video's own page. The header is stored on the shared
    # `headers` dict on purpose, because the download functions reuse it.
    headers["Referer"] = "https://www.pearvideo.com/video_" + contId
    resp = requests.get(url=ajax_url, params=param, headers=headers, timeout=10)
    resp.raise_for_status()

    # The returned address is obfuscated: a long numeric segment has to be
    # replaced by "cont-<id>" to obtain the real one.
    # real:     https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4
    # returned: https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4
    video_url = resp.json()["videoInfo"]["videos"]["srcUrl"]
    video_url = re.sub(r"/\d{10,}", f"/cont-{contId}", video_url)

    url.append({"url": video_url, "name": name})

# Thread-pool implementation (multiprocessing.dummy.Pool is backed by threads).
def main_pool():
    """Download every job in the module-level ``url`` list with a thread pool.

    Each job is a dict with the keys ``"url"`` (video address) and ``"name"``
    (target file name). Files are written to the current working directory.

    Raises:
        requests.RequestException: if a download fails or the server answers
            with an HTTP error status; re-raised by ``pool.map``.
    """
    def get_video(item):
        """Stream one video to disk."""
        name = item["name"]
        print("正在下载" + name + "...")
        # stream=True avoids holding the whole video in memory; the context
        # manager releases the connection even when writing fails.
        with requests.get(url=item["url"], headers=headers, stream=True, timeout=30) as video_resp:
            # Do not save an error page under an .mp4 name.
            video_resp.raise_for_status()
            with open(name, "wb") as f:
                for chunk in video_resp.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        print("成功下载" + name)

    # map() blocks until every job has finished, so leaving the `with` block
    # afterwards only tears down the idle worker threads.
    with Pool(4) as pool:
        pool.map(get_video, url)


# Asynchronous (coroutine) implementation
def main_async():
    """Download every job in the module-level ``url`` list concurrently.

    Each job is a dict with the keys ``"url"`` (video address) and ``"name"``
    (target file name). Files are written to the current working directory.
    Every blocking step inside the coroutines uses an async-capable library
    (aiohttp for HTTP, aiofiles for disk writes) so that downloads overlap.

    Raises:
        aiohttp.ClientError: if a download fails or the server answers with
            an HTTP error status.
    """
    async def get_video(session, item):
        """Stream one video to disk through the shared session."""
        name = item["name"]
        print("正在下载" + name + "...")
        async with session.get(item["url"], headers=headers) as video_resp:
            # Do not save an error page under an .mp4 name.
            video_resp.raise_for_status()
            async with aiofiles.open(name, "wb") as f:
                # Write chunk by chunk instead of buffering the whole video.
                async for chunk in video_resp.content.iter_chunked(64 * 1024):
                    await f.write(chunk)
        print("成功下载" + name)

    async def download_all():
        """Run all downloads over a single ClientSession."""
        # One session for all requests lets aiohttp reuse its connection pool.
        async with aiohttp.ClientSession() as session:
            # gather() propagates the first failure to the caller rather than
            # leaving it as an unretrieved task exception.
            await asyncio.gather(*(get_video(session, item) for item in url))

    # asyncio.run() creates and closes the event loop itself; calling
    # asyncio.get_event_loop() without a running loop is deprecated.
    asyncio.run(download_all())

# Run the coroutine-based downloader. To use the Pool-based variant instead,
# replace this call with main_pool().
main_async()

# Report the wall-clock time elapsed since start_time was recorded.
end_time = time.time()
elapsed = end_time - start_time
print(f"耗时{elapsed}s")