网页数据解析三种方式

正则表达式

示例：用 re 模块对响应文本进行正则匹配。
import re
import requests

# Fetch the page. `url` and `headers` are not defined in this snippet and must
# be supplied by the surrounding code.
# requests applies no timeout by default, so an explicit one keeps the call
# from blocking indefinitely on a stalled server.
resp = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx response instead of matching against an error page.
resp.raise_for_status()
# Placeholder pattern: replace the literal with the real regular expression.
pattern = re.compile(r'正则表达式')
# Collect every non-overlapping match found in the decoded response body.
data = pattern.findall(resp.text)

bs4

示例：用 BeautifulSoup 配合 CSS 选择器提取标签。
import bs4
import requests

# Fetch the page. `url` and `headers` are not defined in this snippet and must
# be supplied by the surrounding code.
# requests applies no timeout by default, so an explicit one keeps the call
# from blocking indefinitely on a stalled server.
resp = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
resp.raise_for_status()
# Create the BeautifulSoup object, using lxml as the parser backend.
soup = bs4.BeautifulSoup(resp.text, 'lxml')
# Extract tags from the page with a CSS selector (placeholder: replace the
# literal with the real selector).
data = soup.select("css选择器")

xpath

示例：用 lxml 的 etree 配合 XPath 表达式提取数据。
import requests
from lxml import etree

# Fetch the page. `url` and `headers` are not defined in this snippet and must
# be supplied by the surrounding code.
# requests applies no timeout by default, so an explicit one keeps the call
# from blocking indefinitely on a stalled server.
resp = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
resp.raise_for_status()
# Parse the decoded response body into an lxml element tree.
tree = etree.HTML(resp.text)

# Evaluate the XPath expression against the tree (placeholder: replace the
# literal with the real path).
data = tree.xpath("xpath路径")