文章

# lxml使用指南

lxml 是 Python 中处理 XML 和 HTML 的强大库,结合了 libxml2 的高性能和 ElementTree 的易用性。以下是详细使用指南:

一、安装与基础

```bash
pip install lxml
```

二、XML 解析与操作

1. 解析 XML

```python
from lxml import etree

# 从字符串解析
xml_string = """<root>
    <person id="1">
        <name>张三</name>
        <age>25</age>
    </person>
</root>"""
root = etree.fromstring(xml_string)

# 从文件解析
tree = etree.parse('data.xml')
root = tree.getroot()

# 从 URL 解析(需要网络)
# import requests
# response = requests.get('http://example.com/data.xml')
# root = etree.fromstring(response.content)
```

2. 遍历与查找

```python
# 获取根标签名
print(root.tag)  # root

# 遍历子元素
for child in root:
    print(child.tag, child.attrib)  # person {'id': '1'}

# 查找第一个匹配元素
person = root.find('person')
name = root.find('person/name')  # 使用路径

# 查找所有匹配元素
all_persons = root.findall('person')

# 获取文本
name_text = person.find('name').text  # '张三'
age_text = person.find('age').text  # '25'
age_int = int(person.find('age').text)  # 转换为整数

# 获取属性
person_id = person.get('id')  # '1'
```

3. XPath 查询(更强大)

```python
# 基本选择
names = root.xpath('//name')  # 所有name元素
names = root.xpath('//person/name/text()')  # 所有name的文本
ages = root.xpath('//person[@id="1"]/age')  # 带条件的查询

# 获取属性
ids = root.xpath('//person/@id')

# 使用谓词
first_person = root.xpath('//person[1]')  # 第一个person
adults = root.xpath('//person[age>18]')  # 年龄大于18的

# 在特定上下文查询
person = root.find('person')
name = person.xpath('./name/text()')[0]
```

4. 创建和修改 XML

```python
# 创建新元素
root = etree.Element("root")
person = etree.SubElement(root, "person", id="1")
name = etree.SubElement(person, "name")
name.text = "李四"

# 添加元素
root.append(etree.Element("new_element"))

# 修改内容
root.find('person/name').text = "王五"
root.find('person').set('id', '2')

# 删除元素
person = root.find('person')
root.remove(person)

# 生成 XML 字符串
xml_str = etree.tostring(root, 
                         encoding='utf-8',
                         pretty_print=True,
                         xml_declaration=True)
print(xml_str.decode('utf-8'))
```

5. 处理命名空间

```python
xml_ns = """<root xmlns:ns="http://example.com">
    <ns:person>张三</ns:person>
</root>"""
root = etree.fromstring(xml_ns)

# 定义命名空间映射
ns = {'ns': 'http://example.com'}
person = root.xpath('//ns:person', namespaces=ns)[0]
print(person.text)  # 张三
```

三、HTML 解析(Web 爬虫常用)

1. 解析 HTML

```python
from lxml import html

# 从字符串解析
html_string = """
<html>
    <body>
        <div class="content">Hello World</div>
        <a href="/link">Click here</a>
    </body>
</html>"""
tree = html.fromstring(html_string)

# 从文件解析
tree = html.parse('page.html')

# 从 URL 解析
# tree = html.parse('http://example.com')

# 使用 requests + lxml
import requests
response = requests.get('http://example.com')
tree = html.fromstring(response.content)
```

2. CSS 选择器

```python
# 按类名选择
elements = tree.cssselect('.content')
for el in elements:
    print(el.text_content())  # Hello World

# 按标签选择
links = tree.cssselect('a')
for link in links:
    print(link.get('href'), link.text)

# 组合选择
items = tree.cssselect('div.content > a.link')
```

3. 常用 HTML 提取方法

```python
# 获取文本内容
text = tree.cssselect('.content')[0].text_content()

# 获取属性
href = tree.cssselect('a')[0].get('href')

# 获取所有文本
all_text = tree.xpath('//text()')

# 提取表格数据
table = tree.cssselect('table')[0]
rows = table.cssselect('tr')
for row in rows:
    cols = row.cssselect('td')
    row_data = [col.text_content().strip() for col in cols]
    print(row_data)
```

四、实际应用示例

示例1:解析 RSS 订阅

```python
import requests
from lxml import etree

url = 'https://rss.example.com/feed.xml'
response = requests.get(url)
root = etree.fromstring(response.content)

# RSS 命名空间
ns = {'atom': 'http://www.w3.org/2005/Atom'}

items = root.xpath('//item')
for item in items:
    title = item.find('title').text
    link = item.find('link').text
    pub_date = item.find('pubDate').text
    print(f"{title}: {link} ({pub_date})")
```

示例2:网页数据抓取

```python
import requests
from lxml import html

def scrape_quotes():
    url = 'http://quotes.toscrape.com'
    response = requests.get(url)
    tree = html.fromstring(response.content)

    quotes = tree.cssselect('.quote')
    for quote in quotes:
        text = quote.cssselect('.text')[0].text_content()
        author = quote.cssselect('.author')[0].text_content()
        tags = [tag.text_content() for tag in quote.cssselect('.tag')]
        print(f'"{text}" - {author}')
        print(f"Tags: {', '.join(tags)}\n")

scrape_quotes()
```

示例3:构建 XML 文档

```python
from lxml import etree

# 创建 XML
root = etree.Element("catalog")

for i in range(3):
    book = etree.SubElement(root, "book", id=str(i+1))
    title = etree.SubElement(book, "title")
    title.text = f"Book {i+1}"
    author = etree.SubElement(book, "author")
    author.text = f"Author {i+1}"
    price = etree.SubElement(book, "price")
    price.text = str(19.99 + i)

# 保存到文件
tree = etree.ElementTree(root)
tree.write('books.xml', 
           encoding='utf-8',
           pretty_print=True,
           xml_declaration=True)
```

五、高级特性

1. 流式解析(大文件处理)

```python
# 使用 iterparse 处理大文件
context = etree.iterparse('large.xml', events=('end',))
for event, elem in context:
    if elem.tag == 'record':
        # 处理记录
        print(elem.find('id').text)
        # 清理内存
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
```

2. XSLT 转换

```python
xslt = etree.XML("""\
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:template match="/">
        <html><body>
            <xsl:apply-templates/>
        </body></html>
    </xsl:template>
</xsl:stylesheet>""")

transform = etree.XSLT(xslt)
result = transform(tree)
print(str(result))
```

3. 验证 XML

```python
# DTD 验证
dtd = etree.DTD(open('schema.dtd'))
is_valid = dtd.validate(tree)

# XML Schema 验证
xml_schema = etree.XMLSchema(etree.parse('schema.xsd'))
is_valid = xml_schema.validate(tree)
```

六、性能优化建议

  1. 使用 XPath 而非多次 find:XPath 通常更快
  2. 编译重复使用的 XPath
    ```python
    find_name = etree.XPath("//person/name/text()")
    names = find_name(root)
    ```
  3. 大文件使用 iterparse
  4. 及时清理内存:处理完元素后调用 elem.clear()
  5. XPath使用细节详见 XPath匹配

七、常见问题解决

编码问题

```python
# 指定编码
parser = etree.XMLParser(encoding='utf-8')
tree = etree.parse('file.xml', parser=parser)

# 修复非法字符
xml_string = xml_string.encode('utf-8', 'ignore')
```

处理不完整 HTML

```python
from lxml import html, etree

parser = html.HTMLParser(remove_blank_text=True, remove_comments=True)
tree = html.fromstring(html_content, parser=parser)
```

lxml 功能强大,以上是核心用法的概览。根据具体需求选择合适的解析方法和查询方式。

本文由作者按照 CC BY 4.0 进行授权