Python使用Playwright实现浏览器自动化的完整指南
作者:夔嶷
一、前言
在当今的Web开发和测试领域,浏览器自动化工具扮演着越来越重要的角色。传统的Selenium虽然功能强大,但在处理现代Web应用时往往显得力不从心。本文将结合Python编程语言,为大家介绍一个更加现代化的浏览器自动化工具——Playwright,并一步步探索它的用法。
Playwright是由Microsoft开发的开源浏览器自动化库,支持Chromium、Firefox和WebKit三大浏览器引擎。它具有速度快、功能丰富、API简洁等优点,特别适合现代Web应用的测试和自动化任务。
二、Playwright核心优势
在开始之前,让我们了解一下Playwright相比传统工具的优势:
- 跨浏览器支持:一套API控制Chromium、Firefox、WebKit
- 自动等待:智能等待元素出现,减少手动等待代码
- 网络拦截:轻松拦截和修改网络请求
- 文件下载/上传:原生支持文件操作
- 多语言支持:Python、JavaScript/TypeScript、Java、.NET
- 无头/有头模式:灵活切换调试和运行模式
- 移动端模拟:支持设备模拟和视口设置
三、环境搭建
3.1 安装Playwright
首先,我们需要安装Playwright库。打开终端,执行以下命令:
# 安装playwright库
pip install playwright
# 安装浏览器驱动
python -m playwright install
3.2 验证安装
安装完成后,我们可以通过以下代码验证安装是否成功:
from playwright.sync_api import sync_playwright
def verify_installation():
    """Smoke-test the Playwright installation with a visible Chromium window.

    Opens https://www.baidu.com, prints the page title, closes the browser
    and then prints a confirmation message.
    """
    with sync_playwright() as playwright:
        # headless=False keeps the browser window on screen during the check.
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()
        tab.goto("https://www.baidu.com")
        title = tab.title()
        print("当前页面标题:", title)
        chromium.close()
        print("Playwright安装验证成功!")


if __name__ == "__main__":
    verify_installation()
四、基础使用
4.1 同步API vs 异步API
Playwright提供了两种API风格:同步API和异步API。对于初学者,推荐使用同步API,代码更直观易懂。
同步API示例:
from playwright.sync_api import sync_playwright
# Synchronous API example: each call returns only after the browser is done.
with sync_playwright() as p:
    # headless=False shows the browser window.
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://www.example.com")
    current_title = page.title()
    print(current_title)
    browser.close()
异步API示例:
import asyncio
from playwright.async_api import async_playwright
async def main():
    """Open example.com with the async API and print the page title."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=False)
        tab = await chromium.new_page()
        await tab.goto("https://www.example.com")
        title = await tab.title()
        print(title)
        await chromium.close()


# Drive the coroutine to completion on a fresh event loop.
asyncio.run(main())
4.2 基本操作
from playwright.sync_api import sync_playwright
def basic_operations():
    """Run a Baidu search and capture the result page.

    Fills the search box, submits the form, waits for the result container,
    saves a screenshot to ``search_results.png`` and prints the length of the
    page's HTML.
    """
    with sync_playwright() as playwright:
        # Start a visible browser and open a tab.
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()

        # Navigate, type the query and submit it.
        tab.goto("https://www.baidu.com")
        tab.fill('input[name="wd"]', "Playwright教程")
        tab.click('input[type="submit"]')

        # Block until the result container is present before capturing.
        tab.wait_for_selector('#content_left')
        tab.screenshot(path="search_results.png")

        # Report how much HTML the page holds.
        html = tab.content()
        print("页面内容长度:", len(html))

        chromium.close()


if __name__ == "__main__":
    basic_operations()
五、高级功能
5.1 选择器进阶
Playwright支持多种选择器,包括CSS、XPath、文本选择器等:
from playwright.sync_api import sync_playwright
def advanced_selectors():
    """Demonstrate CSS, XPath, text and role-based element lookups.

    Only the XPath result is used afterwards (its match count is printed);
    the other lookups are shown for illustration.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()
        tab.goto("https://www.example.com")

        # CSS selector
        container = tab.query_selector('div.container')
        # XPath selector
        items = tab.query_selector_all('//div[@class="item"]')
        # Text locator
        text_button = tab.get_by_text("点击这里")
        # Role-based locator, narrowed by accessible name
        login_button = tab.get_by_role("button", name="登录")

        item_count = len(items)
        print(f"找到 {item_count} 个元素")
        chromium.close()


if __name__ == "__main__":
    advanced_selectors()
5.2 网络请求拦截
Playwright可以拦截和修改网络请求,这在测试API或模拟特定场景时非常有用:
from playwright.sync_api import sync_playwright
def intercept_requests():
    """Register request interceptors, then load example.com.

    Two routes are installed in order: a catch-all that lets every request
    through, and an ``**/api/**`` route that answers requests whose URL
    contains ``api.example.com`` with a canned JSON body.
    """

    def pass_through(route):
        # Forward the request unchanged.
        route.continue_()

    def handle_api_request(route):
        # Anything not aimed at the mocked host goes to the network as usual.
        if "api.example.com" not in route.request.url:
            route.continue_()
            return
        route.fulfill(
            status=200,
            content_type="application/json",
            body='{"status": "success", "data": "mocked data"}',
        )

    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()

        # Intercept every request.
        tab.route("**/*", pass_through)
        # Intercept API requests specifically.
        tab.route("**/api/**", handle_api_request)

        tab.goto("https://www.example.com")
        chromium.close()


if __name__ == "__main__":
    intercept_requests()
5.3 文件上传和下载
from playwright.sync_api import sync_playwright
import os
def file_operations():
    """Upload a local file, then download one into ``./downloads``.

    The upload goes through the native file chooser opened by the page's
    file input. The download is captured with ``expect_download`` so the
    function waits for the real download instead of sleeping for a fixed
    time, and the file is saved under its suggested name.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        page = browser.new_page()

        # File upload: wait for the chooser that the click opens.
        page.goto("https://filebin.net/")
        with page.expect_file_chooser() as fc_info:
            page.click('input[type="file"]')
        file_chooser = fc_info.value
        file_chooser.set_files("example.txt")  # upload a local file

        # File download: make sure the target directory exists first.
        download_path = os.path.join(os.getcwd(), "downloads")
        os.makedirs(download_path, exist_ok=True)

        # Trigger the download inside the expectation so the event cannot be
        # missed. The selector must be a single valid attribute selector.
        with page.expect_download() as download_info:
            page.click('a[href="/download/example"]')
        download = download_info.value
        # save_as is called before the browser closes, replacing the fixed
        # 3-second sleep the snippet previously relied on.
        download.save_as(os.path.join(download_path, download.suggested_filename))

        browser.close()


if __name__ == "__main__":
    file_operations()
六、实战案例:自动化登录测试
下面是一个完整的实战案例,演示如何使用Playwright进行网站登录测试:
from playwright.sync_api import sync_playwright
import time
class LoginTest:
    """Browser-driven login checks against a single login page.

    Call ``setup()`` first to create the browser, context and page, run the
    ``test_*`` methods, and always finish with ``teardown()``.
    """

    def __init__(self):
        self.base_url = "https://example.com/login"

    def setup(self):
        """Start Playwright and open a page with a fixed viewport and user agent.

        If the browser, context or page cannot be created, the Playwright
        driver that was already started is stopped before the error
        propagates, so a failed setup does not leave it running.
        """
        self.playwright = sync_playwright().start()
        try:
            self.browser = self.playwright.chromium.launch(headless=False)
            self.context = self.browser.new_context(
                viewport={"width": 1200, "height": 800},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            self.page = self.context.new_page()
        except Exception:
            self.playwright.stop()
            raise

    def _submit_credentials(self, username, password):
        """Open the login page, fill both fields and submit the form."""
        self.page.goto(self.base_url)
        self.page.fill('input[name="username"]', username)
        self.page.fill('input[name="password"]', password)
        self.page.click('button[type="submit"]')

    def test_login_success(self, username, password):
        """Log in and assert that the welcome message is shown.

        Saves ``login_success.png`` on success. On any failure it prints the
        error, saves ``login_error.png`` and re-raises.

        Raises:
            AssertionError: if the welcome element lacks the expected text.
            Exception: any error raised by the page calls is re-raised.
        """
        try:
            self._submit_credentials(username, password)
            # Wait for the post-login marker to appear.
            self.page.wait_for_selector('.welcome-message', timeout=5000)
            # text_content may return None; normalise it so the assertion
            # below reports its own message instead of raising a TypeError.
            welcome_text = self.page.text_content('.welcome-message') or ""
            assert "欢迎" in welcome_text, "登录失败,未找到欢迎信息"
            print(f"✅ 登录成功!欢迎信息: {welcome_text}")
            self.page.screenshot(path="login_success.png")
        except Exception as e:
            print(f"❌ 登录失败: {str(e)}")
            self.page.screenshot(path="login_error.png")
            raise

    def test_login_failure(self, username, password):
        """Log in with bad credentials and expect the error message element.

        Raises:
            Exception: any error raised by the page calls (including the
                wait timing out) is printed and re-raised.
        """
        try:
            self._submit_credentials(username, password)
            # Wait for the error banner.
            self.page.wait_for_selector('.error-message', timeout=3000)
            error_text = self.page.text_content('.error-message')
            print(f"✅ 错误提示正确显示: {error_text}")
        except Exception as e:
            print(f"❌ 错误提示未显示: {str(e)}")
            raise

    def teardown(self):
        """Close the browser and stop Playwright, even if closing fails."""
        try:
            self.browser.close()
        finally:
            self.playwright.stop()
if __name__ == "__main__":
    # Run the success and failure scenarios against one browser session.
    test = LoginTest()
    test.setup()
    scenarios = (
        # Successful login
        (test.test_login_success, "testuser", "correctpassword123"),
        # Failed login
        (test.test_login_failure, "wronguser", "wrongpassword"),
    )
    try:
        for run_scenario, username, password in scenarios:
            run_scenario(username, password)
    finally:
        test.teardown()
        # NOTE(review): the flattened source does not show whether this print
        # sat inside the finally block or after it - confirm the intent.
        print("测试完成!")
七、最佳实践
7.1 代码组织结构
建议将Playwright代码组织成以下结构:
project/
├── pages/ # 页面对象模型
│ ├── login_page.py
│ └── dashboard_page.py
├── tests/ # 测试用例
│ ├── test_login.py
│ └── test_dashboard.py
├── utils/ # 工具函数
│ ├── helpers.py
│ └── config.py
├── screenshots/ # 截图保存
└── requirements.txt # 依赖管理
7.2 页面对象模型(POM)
使用页面对象模型可以提高代码的可维护性:
# pages/login_page.py
class LoginPage:
    """Page object for the login form.

    Args:
        page: The page-like object that performs navigation and input. It
            must provide ``goto``, ``fill``, ``click``, ``is_visible`` and
            ``text_content``.
        login_url: Address of the login page. Defaults to the production
            URL so existing ``LoginPage(page)`` callers are unaffected.
    """

    def __init__(self, page, login_url="https://example.com/login"):
        self.page = page
        self.login_url = login_url
        # Selectors for the elements this page object interacts with.
        self.username_input = 'input[name="username"]'
        self.password_input = 'input[name="password"]'
        self.login_button = 'button[type="submit"]'
        self.error_message = '.error-message'

    def navigate(self):
        """Open the login page."""
        self.page.goto(self.login_url)

    def login(self, username, password):
        """Fill in both credential fields and submit the form."""
        self.page.fill(self.username_input, username)
        self.page.fill(self.password_input, password)
        self.page.click(self.login_button)

    def get_error_message(self):
        """Return the error text if the error element is visible, else None."""
        if not self.page.is_visible(self.error_message):
            return None
        return self.page.text_content(self.error_message)
7.3 配置管理
# utils/config.py
import os
from dotenv import load_dotenv
load_dotenv()
class Config:
    """Runtime settings read from environment variables at import time."""

    # Spellings accepted as "enabled" for boolean variables, compared after
    # trimming whitespace and lower-casing.
    _TRUE_VALUES = frozenset({"1", "true", "yes", "on"})

    BASE_URL = os.getenv("BASE_URL", "https://example.com")
    BROWSER_TYPE = os.getenv("BROWSER_TYPE", "chromium")
    # Previously only the exact string "true" enabled headless mode, so
    # values such as "1" or "yes" silently selected headed mode.
    HEADLESS = os.getenv("HEADLESS", "false").strip().lower() in _TRUE_VALUES
    # Stored as an int; a non-numeric TIMEOUT value raises ValueError here.
    TIMEOUT = int(os.getenv("TIMEOUT", "30000"))

    @classmethod
    def get_browser_args(cls):
        """Return keyword arguments for launching the browser.

        Headed runs get a 50 ms slow-down and a maximized window flag;
        headless runs get no slow-down and no extra flags.
        """
        headed = not cls.HEADLESS
        return {
            "headless": cls.HEADLESS,
            "slow_mo": 50 if headed else 0,
            "args": ["--start-maximized"] if headed else [],
        }
八、常见问题解决
8.1 元素找不到的处理
from playwright.sync_api import TimeoutError
def safe_find_element(page, selector, timeout=5000):
    """Wait for ``selector`` and return the element, or None on timeout.

    Args:
        page: Page object providing ``wait_for_selector`` and ``screenshot``.
        selector: Selector string to wait for.
        timeout: Value passed through as ``wait_for_selector``'s ``timeout``.

    Returns:
        Whatever ``wait_for_selector`` returns, or None if it raised
        ``TimeoutError``. On timeout a warning is printed and a screenshot
        named ``element_not_found_<unix-seconds>.png`` is saved.
    """
    # Imported locally: this snippet's import line only brings in
    # TimeoutError, so the timestamp below used to raise NameError.
    import time

    try:
        return page.wait_for_selector(selector, timeout=timeout)
    except TimeoutError:
        print(f"⚠️ 元素未找到: {selector}")
        page.screenshot(path=f"element_not_found_{int(time.time())}.png")
        return None
8.2 等待策略优化
# Not recommended: a fixed wait always costs the full 2 seconds.
page.wait_for_timeout(2000)
# Recommended: wait for a concrete condition instead.
page.wait_for_selector('#result', state='visible')
page.wait_for_function('() => document.readyState === "complete"')
# The Python Page API has no wait_for_response(); wait for the "response"
# event and filter on the URL instead. When the triggering action is known,
# prefer wrapping it in `with page.expect_response(...)` so the response
# cannot arrive before the wait starts.
page.wait_for_event("response", lambda response: "/api/data" in response.url)
8.3 性能优化
# Turn off features that are not needed when launching the browser.
launch_args = [
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--disable-software-rasterizer',
]
browser = p.chromium.launch(headless=True, args=launch_args)

# Reuse one browser context for several pages.
context = browser.new_context()
page1 = context.new_page()
page2 = context.new_page()  # opened in the same context as page1
九、总结
Playwright+Python是一个强大的组合,为Web自动化测试和爬虫开发提供了现代化的解决方案。通过本教程,我们学习了:
- 环境搭建:快速安装和配置Playwright
- 基础操作:页面导航、元素交互、截图等
- 高级功能:选择器进阶、网络拦截、文件上传与下载
- 实战案例:完整的登录测试流程
- 最佳实践:代码组织、页面对象模型、配置管理
Playwright的优势在于其现代化的API设计、强大的功能和优秀的性能。相比传统的Selenium,它在处理现代Web应用时更加得心应手,特别是在处理SPA(单页面应用)、WebSocket、文件上传下载等场景时表现尤为出色。
到此这篇关于Python使用Playwright实现浏览器自动化的完整指南的文章就介绍到这了,更多相关Python Playwright浏览器自动化内容,请搜索脚本之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持脚本之家!
