异步爬虫

异步基础

下面的示例使用 asyncio 并发运行三个协程(分别休眠 1、2、3 秒),总耗时约 3 秒:
import asyncio
import time

# Set the event loop policy: on Windows the default policy can fail with
# "RuntimeError: Event loop is closed". The selector policy class only exists
# on Windows, so guard the lookup instead of crashing with AttributeError on
# other platforms.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

async def fn(x):
    """Suspend the calling coroutine for ``x`` seconds.

    Args:
        x: Delay in seconds, passed straight to ``asyncio.sleep``.
    """
    # Every slow operation must be awaited: that suspends this coroutine and
    # hands control back to the event loop so other tasks can run meanwhile.
    await asyncio.sleep(x)

async def main():
    """Run ``fn(1)``, ``fn(2)`` and ``fn(3)`` concurrently and await them all."""
    # create_task wraps each coroutine object in a Task and schedules it right
    # away, so the three sleeps overlap instead of running back to back.
    tasks = [asyncio.create_task(fn(i)) for i in range(1, 4)]
    # gather propagates the first exception raised by any task; asyncio.wait
    # would leave a failure unretrieved inside the returned "done" set.
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # a system clock adjustment the way time.time() can.
    start_time = time.perf_counter()
    # Start the coroutine: asyncio.run drives main() to completion.
    asyncio.run(main())
    print(time.perf_counter() - start_time)

其它异步模块

下面的示例结合 aiofiles 与 aiohttp,并用信号量限制并发量,异步请求并把响应内容写入文件:
import aiofiles
import aiohttp
import asyncio

# Cap concurrency at 50: at most 50 holders of this semaphore run at once.
semaphore = asyncio.Semaphore(50)

async def download(url="url", path="test.txt", headers=None):
    """Fetch ``url`` with aiohttp and write the response body to ``path``.

    The defaults are the original hard-coded placeholders, so a bare
    ``download()`` call requests and writes the same targets.

    Args:
        url: Address to request.
        path: Destination file; opened in binary write mode.
        headers: Optional request headers; ``None`` sends no extra headers.
    """
    # The module-level semaphore bounds how many downloads run at once.
    async with semaphore:
        # ClientSession is the aiohttp counterpart of requests.
        async with aiohttp.ClientSession() as session:
            # session.get() is itself an async context manager, so no extra
            # ``await`` is needed; session.post() is used the same way.
            async with session.get(url=url, headers=headers) as resp:
                # resp corresponds to the response object in requests.
                body = await resp.read()
        # Open the file only after the request has completed, so a network
        # error does not leave behind an empty or truncated file.
        async with aiofiles.open(path, "wb") as f:  # asynchronous file I/O
            await f.write(body)