lxml is a powerful Python library for processing XML and HTML, combining the performance of libxml2 with the ease of use of the ElementTree API. What follows is a detailed usage guide:
I. Installation and Basics
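lxml is distributed on PyPI and is normally installed with pip:

```bash
pip install lxml
```

A quick way to confirm the installation and check the bundled libxml2 version:

```python
from lxml import etree

# LXML_VERSION is a tuple such as (5, 2, 1, 0)
print(etree.LXML_VERSION)
```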
II. XML Parsing and Manipulation
1. Parsing XML
```python
from lxml import etree

# Parse from a string
xml_string = """<root>
    <person id="1">
        <name>Zhang San</name>
        <age>25</age>
    </person>
</root>"""
root = etree.fromstring(xml_string)

# Parse from a file
tree = etree.parse('data.xml')
root = tree.getroot()

# Parse from a URL (requires network access)
# import requests
# response = requests.get('http://example.com/data.xml')
# root = etree.fromstring(response.content)
```
2. Traversal and Searching
```python
# Get the root tag name
print(root.tag)  # root

# Iterate over child elements
for child in root:
    print(child.tag, child.attrib)  # person {'id': '1'}

# Find the first matching element
person = root.find('person')
name = root.find('person/name')  # using a path

# Find all matching elements
all_persons = root.findall('person')

# Get text content
name_text = person.find('name').text  # 'Zhang San'
age_text = person.find('age').text    # '25'
age_int = int(person.find('age').text)  # convert to an integer

# Get an attribute
person_id = person.get('id')  # '1'
```
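Beyond find()/findall(), iter() walks the entire subtree and yields every matching descendant regardless of depth, which is handy for deeply nested documents:

```python
# iter() performs a document-order traversal of the whole subtree
for name in root.iter('name'):
    print(name.text)
```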
3. XPath Queries (More Powerful)
```python
# Basic selection
names = root.xpath('//name')                 # all name elements
names = root.xpath('//person/name/text()')   # text of every name
ages = root.xpath('//person[@id="1"]/age')   # conditional query

# Select attributes
ids = root.xpath('//person/@id')

# Predicates
first_person = root.xpath('//person[1]')     # the first person
adults = root.xpath('//person[age>18]')      # persons older than 18

# Query within a specific context
person = root.find('person')
name = person.xpath('./name/text()')[0]
```
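When part of an expression comes from user input, lxml's xpath() can also bind XPath variables from keyword arguments, which avoids string interpolation (a small sketch; the variable name $pid is arbitrary):

```python
# $pid is bound from the keyword argument, no string formatting needed
matches = root.xpath('//person[@id=$pid]', pid='1')
```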
4. Creating and Modifying XML
```python
# Create new elements
root = etree.Element("root")
person = etree.SubElement(root, "person", id="1")
name = etree.SubElement(person, "name")
name.text = "Li Si"

# Append an element
root.append(etree.Element("new_element"))

# Modify content
root.find('person/name').text = "Wang Wu"
root.find('person').set('id', '2')

# Remove an element
person = root.find('person')
root.remove(person)

# Serialize to an XML string
xml_str = etree.tostring(root,
                         encoding='utf-8',
                         pretty_print=True,
                         xml_declaration=True)
print(xml_str.decode('utf-8'))
```
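If a str is wanted rather than bytes, tostring() also accepts encoding='unicode', which skips the decode step (note that lxml does not accept xml_declaration together with this mode):

```python
# encoding='unicode' makes tostring() return str rather than bytes
xml_str = etree.tostring(root, encoding='unicode', pretty_print=True)
print(xml_str)
```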
5. Handling Namespaces
```python
xml_ns = """<root xmlns:ns="http://example.com">
    <ns:person>Zhang San</ns:person>
</root>"""
root = etree.fromstring(xml_ns)

# Define a namespace prefix mapping
ns = {'ns': 'http://example.com'}
person = root.xpath('//ns:person', namespaces=ns)[0]
print(person.text)  # Zhang San
```
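The same prefix mapping works with find() and findall() as well:

```python
# find()/findall() accept the same namespaces mapping as xpath()
person = root.find('ns:person', namespaces=ns)
print(person.text)  # Zhang San
```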
III. HTML Parsing (Common in Web Scraping)
1. Parsing HTML
```python
from lxml import html

# Parse from a string
html_string = """
<html>
  <body>
    <div class="content">Hello World</div>
    <a href="/link">Click here</a>
  </body>
</html>"""
tree = html.fromstring(html_string)

# Parse from a file
tree = html.parse('page.html')

# Parse from a URL
# tree = html.parse('http://example.com')

# Using requests + lxml
import requests
response = requests.get('http://example.com')
tree = html.fromstring(response.content)
```
2. CSS Selectors
```python
# Note: .cssselect() requires the cssselect package (pip install cssselect)

# Select by class name
elements = tree.cssselect('.content')
for el in elements:
    print(el.text_content())  # Hello World

# Select by tag
links = tree.cssselect('a')
for link in links:
    print(link.get('href'), link.text)

# Combined selectors
items = tree.cssselect('div.content > a.link')
```
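If the same selector is applied many times, it can be compiled once with lxml.cssselect.CSSSelector, which translates the CSS expression to XPath up front:

```python
from lxml.cssselect import CSSSelector

# Compile the selector once, then reuse it on any tree
select_content = CSSSelector('div.content')
for el in select_content(tree):
    print(el.text_content())  # Hello World
```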
3. Common HTML Extraction Methods
```python
# Get text content
text = tree.cssselect('.content')[0].text_content()

# Get an attribute
href = tree.cssselect('a')[0].get('href')

# Get all text nodes
all_text = tree.xpath('//text()')

# Extract table data
table = tree.cssselect('table')[0]
rows = table.cssselect('tr')
for row in rows:
    cols = row.cssselect('td')
    row_data = [col.text_content().strip() for col in cols]
    print(row_data)
```
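Scraped pages often contain relative URLs such as /link; lxml.html can rewrite them in place against a base URL (a small sketch, assuming the page came from http://example.com):

```python
# Rewrite every link in the document to an absolute URL
tree.make_links_absolute('http://example.com')
for element, attribute, link, pos in tree.iterlinks():
    print(link)  # e.g. http://example.com/link
```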
IV. Practical Examples
Example 1: Parsing an RSS Feed
```python
import requests
from lxml import etree

url = 'https://rss.example.com/feed.xml'
response = requests.get(url)
root = etree.fromstring(response.content)

# Atom namespace mapping (only needed if the feed embeds Atom elements)
ns = {'atom': 'http://www.w3.org/2005/Atom'}

# Plain RSS 2.0 items live in no namespace
items = root.xpath('//item')
for item in items:
    title = item.find('title').text
    link = item.find('link').text
    pub_date = item.find('pubDate').text
    print(f"{title}: {link} ({pub_date})")
```
Example 2: Web Scraping
```python
import requests
from lxml import html

def scrape_quotes():
    url = 'http://quotes.toscrape.com'
    response = requests.get(url)
    tree = html.fromstring(response.content)

    quotes = tree.cssselect('.quote')
    for quote in quotes:
        text = quote.cssselect('.text')[0].text_content()
        author = quote.cssselect('.author')[0].text_content()
        tags = [tag.text_content() for tag in quote.cssselect('.tag')]
        print(f'"{text}" - {author}')
        print(f"Tags: {', '.join(tags)}\n")

scrape_quotes()
```
Example 3: Building an XML Document
```python
from lxml import etree

# Build the XML tree
root = etree.Element("catalog")
for i in range(3):
    book = etree.SubElement(root, "book", id=str(i + 1))
    title = etree.SubElement(book, "title")
    title.text = f"Book {i + 1}"
    author = etree.SubElement(book, "author")
    author.text = f"Author {i + 1}"
    price = etree.SubElement(book, "price")
    price.text = str(19.99 + i)

# Save to a file
tree = etree.ElementTree(root)
tree.write('books.xml',
           encoding='utf-8',
           pretty_print=True,
           xml_declaration=True)
```
V. Advanced Features
1. Streaming Parsing (Large Files)
```python
# Use iterparse to process a large file incrementally
context = etree.iterparse('large.xml', events=('end',))
for event, elem in context:
    if elem.tag == 'record':
        # Process the record
        print(elem.find('id').text)
        # Free memory: clear the element and drop processed siblings
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
```
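iterparse() also takes a tag argument that filters events inside the parser, which is usually simpler and faster than checking elem.tag in Python:

```python
# Only 'record' elements trigger events; other tags never reach Python
for event, elem in etree.iterparse('large.xml', events=('end',), tag='record'):
    print(elem.find('id').text)
    elem.clear()
```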
2. XSLT Transformation
```python
xslt = etree.XML("""\
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <html><body>
      <xsl:apply-templates/>
    </body></html>
  </xsl:template>
</xsl:stylesheet>""")
transform = etree.XSLT(xslt)
result = transform(tree)
print(str(result))
```
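Stylesheet parameters can be passed as keyword arguments when calling the transform; string values go through etree.XSLT.strparam() for safe quoting (a sketch assuming the stylesheet declares an xsl:param named greeting):

```python
# Pass a string parameter to the stylesheet's <xsl:param name="greeting"/>
result = transform(tree, greeting=etree.XSLT.strparam("Hello"))
```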
3. Validating XML
```python
# DTD validation
dtd = etree.DTD(open('schema.dtd'))
is_valid = dtd.validate(tree)

# XML Schema validation
xml_schema = etree.XMLSchema(etree.parse('schema.xsd'))
is_valid = xml_schema.validate(tree)
```
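validate() only returns a boolean; for diagnostics, assertValid() raises with a message, and the validator keeps a full error_log:

```python
try:
    xml_schema.assertValid(tree)
except etree.DocumentInvalid as err:
    print(err)                   # first validation error
    print(xml_schema.error_log)  # complete list of errors
```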
VI. Performance Tips
- Prefer a single XPath expression over repeated find() calls: one XPath query is usually faster than chaining several lookups.
- Compile XPath expressions that are reused:

```python
find_name = etree.XPath("//person/name/text()")
names = find_name(root)
```

- Use iterparse for large files.
- Free memory promptly: call elem.clear() after processing each element.
- For finer points of XPath matching, see the XPath section above (II.3).
VII. Troubleshooting Common Issues
Encoding issues
```python
# Specify the encoding explicitly
parser = etree.XMLParser(encoding='utf-8')
tree = etree.parse('file.xml', parser=parser)

# Recover from malformed markup instead of raising a parse error
parser = etree.XMLParser(recover=True)
root = etree.fromstring(xml_string, parser=parser)
```
Handling incomplete HTML
```python
from lxml import html

# The HTML parser is lenient by default and repairs broken markup
parser = html.HTMLParser(remove_blank_text=True, remove_comments=True)
tree = html.fromstring(html_content, parser=parser)
```
lxml is a powerful library, and the above is an overview of its core usage. Choose the parsing method and query style that fit your specific needs.