Basic Usage of Scrapy

Environment Setup

pip install scrapy

Project Operations

Create a Scrapy project in the current directory
scrapy startproject [project_name]
scrapy startproject test_scrapy

Enter the project directory and generate a spider file
scrapy genspider [name] [url]
scrapy genspider first www.xxx.com

Run the project
scrapy crawl [name]
scrapy crawl first
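
For reference, the spider file produced by genspider (spiders/first.py) looks roughly like the sketch below; the exact template varies slightly between Scrapy versions.

import scrapy

class FirstSpider(scrapy.Spider):
    name = "first"
    allowed_domains = ["www.xxx.com"]
    start_urls = ["https://www.xxx.com"]

    def parse(self, response):
        pass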

Configuration File (settings.py)

USER_AGENT sets the User-Agent.

ROBOTSTXT_OBEY controls whether the robots.txt protocol is obeyed.

# Only output ERROR-level logs
LOG_LEVEL = "ERROR"

# 300 is the priority; smaller values run earlier
# Add more pipeline classes to store data to multiple destinations
ITEM_PIPELINES = {
    "test_scrapy.pipelines.TestScrapyPipeline": 300,
}
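
Put together, a minimal settings.py sketch for this project might look like the following; the User-Agent string is only an example value, and ROBOTSTXT_OBEY is simply set to False here for illustration.

# settings.py (sketch)
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"  # example UA
ROBOTSTXT_OBEY = False      # do not obey robots.txt
LOG_LEVEL = "ERROR"         # only output ERROR-level logs
ITEM_PIPELINES = {
    "test_scrapy.pipelines.TestScrapyPipeline": 300,
}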

Data Parsing

Get the first match: response.xpath().get()
Get all matches as a list: response.xpath().getall()
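
A small sketch of both helpers inside a parse method; the XPath expressions are placeholders, not taken from a real page.

def parse(self, response):
    # .get() returns the first match as a string (None if nothing matches)
    page_title = response.xpath("//title/text()").get()
    # .getall() returns every match as a list of strings
    link_texts = response.xpath("//li/a/text()").getall()
    print(page_title, link_texts)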

Persistent Storage

Command-based persistent storage

Stores the items returned by the parse function; only certain file extensions (such as .json, .csv, .xml) are supported
scrapy crawl first -o test.csv
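
For the command above to produce rows, parse only needs to yield dict-like items; a sketch reusing the Douban title XPath from the pipeline example later in these notes:

def parse(self, response):
    for title in response.xpath('//li[@class="title"]/a/text()').getall():
        yield {"title": title}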

Pipeline-based persistent storage

  1. Add ITEM_PIPELINES to the configuration file

    ITEM_PIPELINES = {
        "test_scrapy.pipelines.TestScrapyPipeline": 300,
    }
  2. Define the item class

    import scrapy

    class TestScrapyItem(scrapy.Item):
        # define one field per attribute, e.g.
        # name = scrapy.Field()
        title = scrapy.Field()
  3. Instantiate the item class and submit it to the pipeline

    import scrapy
    from test_scrapy.items import TestScrapyItem

    class FirstSpider(scrapy.Spider):
        name = "first"
        # restrict the crawl to these domains
        #allowed_domains = ["www.xxx.com"]
        # URLs requested at startup
        start_urls = ["https://movie.douban.com/"]

        def parse(self, response):
            titles = response.xpath('//li[@class="title"]/a/text()').getall()

            for title in titles:
                item = TestScrapyItem()
                item["title"] = title
                # submit the item to the pipeline
                yield item
  4. Define the pipeline class

File storage

from itemadapter import ItemAdapter

class TestScrapyPipeline:
    fp = None

    # called once when the spider starts
    def open_spider(self, spider):
        print("Spider started")
        self.fp = open("test.txt", "w", encoding="utf-8")

    # process each item
    def process_item(self, item, spider):
        title = item["title"]
        self.fp.write(title + "\n")
        # pass the item on to the next pipeline class
        return item

    # called once when the spider closes
    def close_spider(self, spider):
        print("Spider finished")
        self.fp.close()

Integrating MySQL for database storage

from itemadapter import ItemAdapter
import pymysql

class WynewsPipeline:
    conn = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host="localhost", port=3306, user="root", password="123456", database="test2")

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            sql = "insert into news (title, text) values (%s, %s)"
            cursor.execute(sql, (item["title"], item["text"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

To store data in several places at once, just define multiple pipeline classes in pipelines.py and add each class name with a priority to ITEM_PIPELINES in the configuration file.
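
For example, to run both the file pipeline and the MySQL pipeline defined above (assuming both classes live in test_scrapy/pipelines.py; the priorities are arbitrary, lower runs first):

ITEM_PIPELINES = {
    "test_scrapy.pipelines.TestScrapyPipeline": 300,
    "test_scrapy.pipelines.WynewsPipeline": 301,
}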

Adding new requests to the request queue

def parse(self, response):
    ...
    # callback specifies the parsing function for the new request
    yield scrapy.Request(url=new_url, callback=self.parse)
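
A pagination sketch built on this pattern; the URL template and the five-page limit are made up for illustration.

import scrapy

class PageSpider(scrapy.Spider):
    name = "pages"
    start_urls = ["https://www.xxx.com/list?page=1"]
    page_num = 1

    def parse(self, response):
        for title in response.xpath('//li[@class="title"]/a/text()').getall():
            yield {"title": title}
        # keep following the next page until page 5
        if self.page_num < 5:
            self.page_num += 1
            new_url = f"https://www.xxx.com/list?page={self.page_num}"
            yield scrapy.Request(url=new_url, callback=self.parse)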

Passing data between requests (meta)

# custom callback
def parse_save(self, response):
    ...
    item = response.meta["item"]
    # persist the item
    yield item

def parse(self, response):
    ...
    item = ...
    # pass the item to parse_save as a parameter
    yield scrapy.Request(url=new_url, callback=self.parse_save, meta={"item": item})

Image Crawling

Use ImagesPipeline

  1. Add the storage path to the configuration file
    IMAGES_STORE = "imgs"

  2. Submit an item containing the image URL from the parse function (see the sketch after this list)

  3. Define the pipeline class

    from itemadapter import ItemAdapter
    from scrapy.pipelines.images import ImagesPipeline
    import scrapy

    class ImgPipeline(ImagesPipeline):
        # send a request for each image URL
        def get_media_requests(self, item, info):
            yield scrapy.Request(url=item["img_url"])

        # specify the file name used when saving the image
        def file_path(self, request, response=None, info=None, *, item=None):
            return item["img_url"].split("/")[-1]

        # pass the item on to the next pipeline
        def item_completed(self, results, item, info):
            return item
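
The spider side for step 2 might look like the sketch below; the site URL and XPath are placeholders, and only the img_url field name matches ImgPipeline above. The pipeline also has to be registered, e.g. ITEM_PIPELINES = {"test_scrapy.pipelines.ImgPipeline": 300}.

import scrapy

class ImgSpider(scrapy.Spider):
    name = "img"
    start_urls = ["https://www.xxx.com/"]

    def parse(self, response):
        for src in response.xpath('//div[@class="pic"]/img/@src').getall():
            # the item only needs to carry the image URL read by get_media_requests
            yield {"img_url": src}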

Downloader Middleware

Purpose: intercept requests and responses.
First, enable the downloader middleware in the configuration file (a sketch follows), then implement the middleware class in middlewares.py:
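
A sketch of that setting, assuming the project is named img_pro and the middleware keeps its generated class name (543 is the template's default priority; adjust the dotted path to your project):

DOWNLOADER_MIDDLEWARES = {
    "img_pro.middlewares.ImgProDownloaderMiddleware": 543,
}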

import random

class ImgProDownloaderMiddleware:
    ua_list = [...]
    proxy = [...]

    # intercept normal requests
    def process_request(self, request, spider):
        # UA spoofing
        request.headers["User-Agent"] = random.choice(self.ua_list)
        return None

    # intercept all responses
    def process_response(self, request, response, spider):
        return response

    # intercept requests that raised an exception
    def process_exception(self, request, exception, spider):
        # switch to a proxy IP
        request.meta["proxy"] = "http(s)://" + random.choice(self.proxy)
        # reschedule the request
        return request

Using the downloader middleware with Selenium to fetch dynamically loaded data

Spider file

from selenium import webdriver
...

class GetNewsSpider(scrapy.Spider):
    def __init__(self):
        # keep one browser instance on the spider for the middleware to reuse
        self.bro = webdriver.Chrome()
    ......
    # called automatically when the spider closes
    def closed(self, spider):
        self.bro.quit()

Downloader middleware

from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep
...

class WynewsDownloaderMiddleware:
    def process_response(self, request, response, spider):
        # intercept only the responses we care about
        if request.url in spider.module_urls:
            # use selenium to fetch the fully rendered page
            bro = spider.bro
            bro.get(request.url)
            sleep(1)
            # return a new response object to the engine
            return HtmlResponse(url=request.url, body=bro.page_source, encoding="utf-8", request=request)
        else:
            return response
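
For this middleware to work, the spider has to expose the bro and module_urls attributes it reads; a sketch of that wiring (the start URL and XPath are placeholders):

from selenium import webdriver
import scrapy

class GetNewsSpider(scrapy.Spider):
    name = "get_news"
    start_urls = ["https://news.xxx.com/"]

    def __init__(self):
        self.bro = webdriver.Chrome()
        # URLs whose responses the middleware replaces with selenium-rendered pages
        self.module_urls = []

    def parse(self, response):
        # collect the dynamically loaded section URLs, then request them
        for url in response.xpath('//div[@class="module"]/a/@href').getall():
            self.module_urls.append(url)
            yield scrapy.Request(url=url, callback=self.parse_module)

    def parse_module(self, response):
        ...

    def closed(self, spider):
        self.bro.quit()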

Site-wide Crawling with CrawlSpider

  • Create the project
  • Create the spider file (CrawlSpider)
    scrapy genspider -t crawl [name] [url]
  • Spider file
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    class SunSpider(CrawlSpider):
        name = "sun"
        #allowed_domains = ["www.xxx.com"]
        start_urls = ["https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1"]

        # allow is a regular expression
        # LinkExtractor extracts links that match the given rule
        # the response for each matched URL is passed to the callback
        # follow controls whether extracted pages are crawled recursively
        rules = (Rule(LinkExtractor(allow=r"id=1&page=\d+"), callback="parse_item", follow=True),)

        def parse_item(self, response):
            item = {}
            return item
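
parse_item above is only a stub; a filled-in version could look like the following (the XPath is illustrative, not taken from the actual site):

def parse_item(self, response):
    item = {}
    # adapt the selector to the real page structure
    item["title"] = response.xpath('//span[@class="title"]/text()').get()
    item["url"] = response.url
    yield item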