异步&进程池&防盗链爬取梨视频热门页视频

梨视频热门页

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import requests
from lxml import etree
import random
import re
from multiprocessing.dummy import Pool
import time
import aiohttp
import asyncio
import aiofiles

# Landing page that lists the currently popular videos.
url = "https://www.pearvideo.com/popular"
# Browser-like User-Agent sent with every request this script makes.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

# Fetch the popular page and parse the HTML into an element tree.
resp = requests.get(url=url, headers=headers)
tree = etree.HTML(resp.text)

# Pull the link target and the title of every entry in the popular list.
contIds = tree.xpath('//ul[@class="popular-list"]//div[@class="popularem-ath"]/a/@href')
names = tree.xpath('//ul[@class="popular-list"]//div[@class="popularem-ath"]/a/h2/text()')

# The video id is the part of the href after the first underscore;
# the title plus ".mp4" becomes the local file name.
contIds = [href.split("_")[1] for href in contIds]
names = [title + ".mp4" for title in names]

# Timing starts here, so the reported duration covers the videoStatus
# requests below as well as the downloads.
start_time = time.time()

# Download jobs, one {"url": ..., "name": ...} dict per video.
# NOTE: this rebinds the module-level name `url` (previously the page URL)
# to a list; main_pool() and main_async() read it under this name.
url = []

# Only the first three videos of the popular page are downloaded.  Slicing
# (instead of indexing with range(3)) avoids an IndexError when the page
# yielded fewer than three entries.
for contId, name in zip(contIds[:3], names[:3]):
    # The <video> tag is loaded dynamically, so the real address is obtained
    # from the AJAX endpoint found by inspecting the page's network traffic.
    ajax_url = "https://www.pearvideo.com/videoStatus.jsp?"
    # The request carries the video id and a random number.
    param = {"contId": contId, "mrd": str(random.random())}
    # Anti-hotlinking: the endpoint rejects requests without a Referer that
    # points at the video page.  The shared headers dict is updated in place,
    # so the downloads later send the Referer of the last video processed.
    headers["Referer"] = "https://www.pearvideo.com/video_" + contId
    resp = requests.get(url=ajax_url, params=param, headers=headers)

    # The address returned by the endpoint is obfuscated: a long numeric
    # segment must be replaced by "cont-<id>" to obtain the real address.
    # real:   "https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4"
    # served: "https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4"
    video_url = resp.json()["videoInfo"]["videos"]["srcUrl"]
    video_url = re.sub(r"/\d{10,}", f"/cont-{contId}", video_url)

    url.append({"url": video_url, "name": name})

# Pool-based implementation
def main_pool():
    """Download every job in the module-level ``url`` list with a worker pool.

    ``Pool`` is imported from ``multiprocessing.dummy``, which offers the
    ``multiprocessing.Pool`` API backed by threads rather than processes.
    Each job is a dict with the keys ``"url"`` (download address) and
    ``"name"`` (local file name).  Returns ``None``.
    """
    # Nothing to download: do not spin up any workers.
    if not url:
        return

    def get_video(item):
        """Fetch one video and write it to a file named ``item["name"]``."""
        name = item["name"]
        print("正在下载" + name + "...")
        video_resp = requests.get(url=item["url"], headers=headers)
        with open(name, "wb") as f:
            f.write(video_resp.content)
        print("成功下载" + name)

    # map() blocks until every job has finished; the with-block then releases
    # the workers instead of leaving the pool open.
    with Pool(4) as pool:
        pool.map(get_video, url)


# Asyncio implementation
def main_async():
    """Download every job in the module-level ``url`` list concurrently.

    Each job is a dict with the keys ``"url"`` (download address) and
    ``"name"`` (local file name).  All downloads share one aiohttp session.
    A failing download is reported on stdout and does not abort the others.
    Returns ``None``.
    """
    # Nothing to download: no event loop or session is needed.
    if not url:
        return

    async def get_video(session, item):
        """Fetch one video through ``session`` and save it as ``item["name"]``."""
        name = item["name"]
        print("正在下载" + name + "...")
        # Everything that waits on I/O is awaited, using async-capable
        # libraries (aiohttp instead of requests, aiofiles instead of open).
        async with session.get(url=item["url"], headers=headers) as video_resp:
            # read() yields the raw bytes of the response body.
            content = await video_resp.read()
        async with aiofiles.open(name, "wb") as f:
            await f.write(content)
        print("成功下载" + name)

    async def download_all(items):
        """Run all downloads concurrently and return one outcome per job."""
        async with aiohttp.ClientSession() as session:
            # return_exceptions=True keeps one failure from cancelling the
            # remaining downloads; errors come back as result values.
            return await asyncio.gather(
                *(get_video(session, item) for item in items),
                return_exceptions=True,
            )

    # asyncio.run creates the event loop, runs the coroutine and closes the loop.
    outcomes = asyncio.run(download_all(url))
    for item, outcome in zip(url, outcomes):
        if isinstance(outcome, BaseException):
            print("下载失败" + item["name"] + ": " + repr(outcome))

# Run the asyncio-based downloader.
main_async()
# Alternative: the multiprocessing.dummy pool-based downloader (disabled).
#main_pool()

# Report the wall-clock time elapsed since start_time was recorded.
end_time = time.time()
print(f"耗时{end_time - start_time}s")