# Commit f8bc4f3e authored by nextime
#
# Add playwright_helper
#
# parent 3b722898
# -*- coding: utf-8 -*-
from playwright.async_api import async_playwright, Browser, BrowserContext
import json
import shutil
import errno
from pathlib import Path
import asyncio
import uuid
from typing import List, Optional
import logging
# Module-level logger named after this module. The original statement called
# ``logging.getLogger(__name__)`` and discarded the result; binding it makes the
# logger available to importers. NOTE: the functions in this file log through
# the root ``logging.*`` helpers, not through this object.
logger = logging.getLogger(__name__)

# Base directories for storing profile data and storage states (cross-platform).
# Both are relative paths, so they resolve against the current working directory.
BASE_PROFILE_DIR = Path("data/chromium_profiles")  # Directory for user data/profile directories
BASE_STATE_DIR = Path("data/browser_states")  # Directory for storage state JSON files
def copytree_ignore_errors(src: Path, dst: Path, ignore_patterns: Optional[List[str]] = None):
    """Copy a directory tree, skipping lock files and tolerating transient errors.

    Args:
        src: Directory to copy from.
        dst: Destination directory; it must not exist yet (``dirs_exist_ok=False``).
        ignore_patterns: Substrings; any file or directory whose name contains one
            of them is skipped. Defaults to Chromium's ``SingletonLock``,
            ``SingletonCookie`` and ``SingletonSocket`` entries.

    Raises:
        OSError: For any OS error other than ENOENT, EAGAIN or EBUSY (for
            example when ``dst`` already exists).
    """
    # A ``None`` sentinel replaces the former mutable list default, which was
    # shared between calls.
    patterns = (
        ("SingletonLock", "SingletonCookie", "SingletonSocket")
        if ignore_patterns is None
        else tuple(ignore_patterns)
    )

    def ignore_files(directory, files):
        # Callback for shutil.copytree: return the names to leave out.
        return [name for name in files if any(pattern in name for pattern in patterns)]

    try:
        shutil.copytree(src, dst, ignore=ignore_files, dirs_exist_ok=False)
    except shutil.Error as e:
        # shutil.Error aggregates per-file failures; the rest of the tree was copied.
        # Logged at WARNING so it is visible under the default logging configuration.
        logging.warning(f"Non-critical errors during copy: {e}")
    except OSError as e:
        # Only "file vanished" / "try again" / "busy" are treated as harmless.
        if e.errno not in (errno.ENOENT, errno.EAGAIN, errno.EBUSY):
            raise
        logging.warning(f"Skipped problematic file during copy: {e}")
class PersistentBrowser:
    """Simulate a Playwright ``Browser`` whose contexts keep state across runs.

    Two modes are supported, selected by ``use_persistent_context``:

    * ``True``  - every context is its own ``launch_persistent_context`` call with
      a dedicated user data directory under ``base_profile_dir``.
    * ``False`` - one Chromium instance is started with ``chromium.launch`` and
      each context is restored from / saved to a storage state JSON file.
    """

    def __init__(self, playwright, base_profile_dir: Path, use_persistent_context: bool,
                 extension_paths: Optional[List[Path]] = None, **kwargs):
        """Store the configuration; nothing is launched until ``launch()``.

        Args:
            playwright: The object yielded by ``async_playwright()``.
            base_profile_dir: Directory that holds one profile folder per context.
            use_persistent_context: Choose the mode (see class docstring).
            extension_paths: Unpacked extension directories to install per context.
            **kwargs: Browser launch options (``headless`` defaults to ``False``).
        """
        self._playwright = playwright
        self._base_profile_dir = Path(base_profile_dir)
        self._use_persistent_context = use_persistent_context
        self._extension_paths = [Path(p) for p in (extension_paths or [])]
        # context_id -> BrowserContext for every context created through new_context()
        self._contexts = {}
        # Only set in browser.launch mode
        self._browser = None
        self._is_closed = False
        self._kwargs = kwargs

    def contexts(self):
        """Return the list of open contexts.

        In browser.launch mode this is the underlying browser's own list; in
        persistent-context mode there is no shared browser object, so the
        contexts tracked by this wrapper are returned instead.
        """
        if self._browser is not None:
            return self._browser.contexts
        return list(self._contexts.values())

    async def launch(self):
        """Prepare the profile directory and, in browser.launch mode, start Chromium.

        Returns:
            ``self``, so the call can be chained.
        """
        self._base_profile_dir.mkdir(parents=True, exist_ok=True)
        if not self._use_persistent_context:
            # Merging into a dict lets callers override ``headless`` without a
            # "multiple values for keyword argument" TypeError.
            launch_options = {"headless": False, **self._kwargs}
            self._browser = await self._playwright.chromium.launch(**launch_options)
        logging.info("Persistent browser launched.")
        return self

    def _extension_launch_args(self) -> List[str]:
        """Build the Chromium flags that load the configured unpacked extensions."""
        existing = [str(p.resolve()) for p in self._extension_paths if p.exists()]
        if not existing:
            return []
        joined = ",".join(existing)
        return [f"--disable-extensions-except={joined}", f"--load-extension={joined}"]

    async def new_context(self, context_id: str = None, **kwargs) -> "BrowserContext":
        """Create a context with its own persistent state.

        Args:
            context_id: Identifier used for the profile directory and state file;
                a random UUID is generated when omitted.
            **kwargs: Context options forwarded to Playwright. ``extension_paths``
                is accepted for convenience and ignored here.

        Returns:
            The newly created ``BrowserContext``.

        Raises:
            RuntimeError: If the browser was closed, or (browser.launch mode)
                ``launch()`` has not been called yet.
        """
        if self._is_closed:
            raise RuntimeError("Browser is closed.")
        context_id = context_id or str(uuid.uuid4())

        # Create or restore the profile directory for this context.
        profile_dir = self._get_profile_dir(context_id)
        if not profile_dir.exists():
            restore_profile_dir(context_id, self._base_profile_dir)
            create_profile_dir(context_id, self._base_profile_dir)

        state_file = BASE_STATE_DIR / f"browser_state_{context_id}.json"
        options = {k: v for k, v in kwargs.items() if k != "extension_paths"}

        if self._use_persistent_context:
            # Later entries win: defaults < constructor launch options < per-call
            # options. Building one dict avoids passing ``viewport`` twice.
            launch_options = {
                "headless": False,
                "viewport": {"width": 1280, "height": 720},
                **self._kwargs,
                **options,
            }
            # Unpacked extensions are only loaded by Chromium when passed on the
            # command line of a persistent context.
            extension_args = self._extension_launch_args()
            if extension_args:
                launch_options["args"] = [*(launch_options.get("args") or ()), *extension_args]
            context = await self._playwright.chromium.launch_persistent_context(
                user_data_dir=profile_dir,
                **launch_options,
            )
            if state_file.exists():
                # localStorage already lives in the user data directory; only the
                # saved cookies are re-applied. (BrowserContext has no
                # ``evaluate`` method, so the old localStorage loop could not run.)
                with open(state_file, "r", encoding="utf-8") as f:
                    storage_state = json.load(f)
                cookies = storage_state.get("cookies")
                if cookies:
                    await context.add_cookies(cookies)
                logging.info(f"Loaded browser state for context {context_id} from {state_file}")
        else:
            if not self._browser:
                raise RuntimeError("Browser not launched.")
            context_options = {k: v for k, v in options.items() if k != "viewport"}
            viewport = options.get("viewport")
            if viewport:
                context_options["viewport"] = viewport
            restore_state = state_file.exists() and "storage_state" not in context_options
            if restore_state:
                # Playwright restores cookies and localStorage itself from the file.
                context_options["storage_state"] = state_file
            context = await self._browser.new_context(**context_options)
            if restore_state:
                logging.info(f"Loaded browser state for context {context_id} from {state_file}")

        self._contexts[context_id] = context
        logging.info(f"Created new context with ID {context_id} using profile {profile_dir}")

        # Install pre-set extensions for this context.
        for ext_path in self._extension_paths:
            await install_extension(context, ext_path, context_id)
        return context

    async def close(self):
        """Save state, close every context, back up profiles, then close the browser.

        Shutdown is best effort: a context that fails to save or close (for
        example because its window was already closed by the user) is logged
        with a traceback and does not prevent the remaining contexts from
        being handled. Calling ``close()`` again is a no-op.
        """
        if self._is_closed:
            return
        # Snapshot the mapping: the awaits below yield to the event loop, and a
        # dict must not change size while it is being iterated.
        for context_id, context in list(self._contexts.items()):
            try:
                await save_browser_state(context, context_id)
                # Closing releases Chromium's Singleton* files before the backup.
                await context.close()
                if self._use_persistent_context:
                    backup_profile_dir(context_id, self._base_profile_dir)
            except Exception:
                logging.exception(f"Failed to shut down context {context_id} cleanly")
        if self._browser:
            await self._browser.close()
        self._contexts.clear()
        self._is_closed = True
        logging.info("Persistent browser closed.")

    def _get_profile_dir(self, context_id: str) -> Path:
        """Return the profile directory path for ``context_id``."""
        return self._base_profile_dir / f"chromium_profile_{context_id}"
async def save_browser_state(context: BrowserContext, context_id: str):
    """Write the storage state of ``context`` to its JSON file.

    Args:
        context: The context whose storage state is saved.
        context_id: Identifier used to build ``browser_state_<context_id>.json``
            inside ``BASE_STATE_DIR``.
    """
    state_file = BASE_STATE_DIR / f"browser_state_{context_id}.json"
    # Make sure the target directory exists instead of relying on the caller
    # having created BASE_STATE_DIR beforehand.
    state_file.parent.mkdir(parents=True, exist_ok=True)
    await context.storage_state(path=state_file)
    logging.info(f"Saved browser state for context {context_id} to {state_file}")
def create_profile_dir(context_id: str, base_profile_dir: Path):
    """Ensure the profile directory of a context exists.

    The directory is ``<base_profile_dir>/chromium_profile_<context_id>``;
    missing parents are created and an already existing directory is left
    untouched.
    """
    folder_name = f"chromium_profile_{context_id}"
    target = base_profile_dir / folder_name
    target.mkdir(exist_ok=True, parents=True)
    logging.info(f"Created profile directory for context {context_id} at {target}")
def backup_profile_dir(context_id: str, base_profile_dir: Path):
    """Snapshot a context's profile directory into its backup location.

    The profile ``chromium_profile_<context_id>`` is copied to
    ``chromium_profile_backup_<context_id>`` in the same base directory,
    replacing any previous backup. Nothing happens when the profile
    directory does not exist.
    """
    source = base_profile_dir / f"chromium_profile_{context_id}"
    backup_dir = base_profile_dir / f"chromium_profile_backup_{context_id}"
    if not source.exists():
        return
    # copytree needs a fresh destination, so drop the stale backup first.
    if backup_dir.exists():
        shutil.rmtree(backup_dir)
    copytree_ignore_errors(source, backup_dir)
    logging.info(f"Backed up profile directory for context {context_id} to {backup_dir}")
def restore_profile_dir(context_id: str, base_profile_dir: Path):
    """Recreate a context's profile directory from its backup, if one exists.

    When ``chromium_profile_backup_<context_id>`` is present, the current
    profile directory is removed and replaced by a copy of the backup.
    Without a backup the profile is left as it is.
    """
    profile_dir = base_profile_dir / f"chromium_profile_{context_id}"
    backup_dir = base_profile_dir / f"chromium_profile_backup_{context_id}"
    if not backup_dir.exists():
        logging.info(f"No backup profile directory found for context {context_id}.")
        return
    # The copy requires that the destination does not exist yet.
    if profile_dir.exists():
        shutil.rmtree(profile_dir)
    copytree_ignore_errors(backup_dir, profile_dir)
    logging.info(f"Restored profile directory for context {context_id} from {backup_dir}")
async def install_extension(context: BrowserContext, extension_path: Path, context_id: str,
                            base_profile_dir: Optional[Path] = None):
    """Copy an unpacked extension into a context's profile and add a ping script.

    Args:
        context: Context that receives the init script.
        extension_path: Directory of the unpacked extension.
        context_id: Identifier of the context; selects the profile directory.
        base_profile_dir: Directory containing the profile folders. Defaults to
            ``BASE_PROFILE_DIR`` so existing callers behave as before.

    NOTE(review): copying files into ``Extensions`` does not by itself make
    Chromium load the extension; Playwright documents loading unpacked
    extensions through ``--load-extension`` on a persistent context.
    """
    extension_path = Path(extension_path)
    if not extension_path.exists():
        logging.info(f"Extension path {extension_path} does not exist for context {context_id}.")
        return

    profile_root = BASE_PROFILE_DIR if base_profile_dir is None else Path(base_profile_dir)
    profile_dir = profile_root / f"chromium_profile_{context_id}"

    # Copy the extension into the user data directory's Extensions folder (once).
    extension_dest = profile_dir / "Extensions" / extension_path.name
    if not extension_dest.exists():
        shutil.copytree(extension_path, extension_dest)
        logging.info(f"Copied extension to {extension_dest} for context {context_id}")

    # Optional ping towards the extension. ``chrome.runtime`` is not defined on
    # ordinary pages, so the call is guarded to avoid an exception on every
    # page load.
    await context.add_init_script(
        script="""
        try {
            if (typeof chrome !== 'undefined' && chrome.runtime && chrome.runtime.sendMessage) {
                chrome.runtime.sendMessage({ message: 'Extension loaded' });
            }
        } catch (e) {
            /* messaging is unavailable on this page */
        }
        """
    )
    logging.info(f"Extension loaded from {extension_path} for context {context_id}")
async def run_context(persistent_browser: PersistentBrowser, context_id: str, extension_paths: List[Path]):
    """Open one context, load the sample page in it and log the page title.

    ``extension_paths`` is forwarded to ``new_context`` as a keyword option.
    The context and page are left open when this coroutine returns.
    """
    context_options = {"context_id": context_id, "extension_paths": extension_paths}
    context = await persistent_browser.new_context(**context_options)

    # Sample action: visit a page and report its title.
    page = await context.new_page()
    await page.goto("https://example.com")
    title = await page.title()
    logging.info(f"Page title for context {context_id}: {title}")
async def run(playwright, num_contexts=3, use_persistent_context=True):
    """Run several browser contexts with persistent state until cancelled.

    Args:
        playwright: The object yielded by ``async_playwright()``.
        num_contexts: How many contexts to open concurrently.
        use_persistent_context: ``True`` for ``launch_persistent_context`` mode,
            ``False`` for a single ``browser.launch`` instance.

    After all contexts have loaded the sample page, the coroutine keeps the
    browser(s) open until its task is cancelled (for example by Ctrl+C under
    ``asyncio.run``). The browser is then closed in a ``finally`` block, which
    saves the states and backs up the profiles.
    """
    # Ensure base directories exist.
    BASE_PROFILE_DIR.mkdir(parents=True, exist_ok=True)
    BASE_STATE_DIR.mkdir(parents=True, exist_ok=True)

    # Pre-set extensions copied to all contexts; replace with real paths to
    # unpacked extension directories.
    extension_paths = [
        Path("./my_extension1"),
        Path("./my_extension2"),
    ]

    persistent_browser = PersistentBrowser(playwright, BASE_PROFILE_DIR, use_persistent_context, extension_paths)
    await persistent_browser.launch()
    try:
        # One coroutine per context, each with a fresh unique ID, run concurrently.
        await asyncio.gather(
            *(
                run_context(persistent_browser, str(uuid.uuid4()), extension_paths)
                for _ in range(num_contexts)
            )
        )
        # Keep the windows open. The sleep MUST be awaited: a bare
        # ``asyncio.sleep(...)`` only creates a coroutine object, turning this
        # loop into a busy spin that blocks the event loop.
        while True:
            await asyncio.sleep(0.1)
    finally:
        # Reached on cancellation or error.
        await persistent_browser.close()
async def main():
    """Main entry point to test both modes.

    HOW TO USE
    ----------
    This script provides a unified solution for managing multiple Playwright browser contexts with persistent states (session data, extensions, cache) using two approaches:
    1. launch_persistent_context mode: Uses Playwright's launch_persistent_context() for native persistence via user data directories.
    2. browser.launch mode: Uses Playwright's browser.launch() with manual persistence via storage state files and profile directories.

    Prerequisites:
    - Install Playwright: Run "pip install playwright" in your terminal.
    - Install Chromium: Run "playwright install chromium" to download the Chromium browser used by Playwright.
    - Ensure Python 3.8+ is installed (async/await syntax and shutil.copytree(dirs_exist_ok=...) are used).

    Setting Up Extensions:
    - Update the EXTENSION_PATHS list in the run() function with paths to unpacked extension directories, each containing a manifest.json file.
    - Example:
        EXTENSION_PATHS = [
            Path("./my_extension1"),
            Path("./my_extension2")
        ]
    - To obtain unpacked extensions:
        - On Linux/macOS: Extract a .crx file using "unzip <extension>.crx".
        - On Windows: Use 7-Zip to extract a .crx file (it is a ZIP archive).
        - Alternatively, pre-install extensions in a Chromium browser and copy its profile directory (e.g., ~/.config/chromium on Linux, ~/Library/Application Support/Chromium on macOS, %LocalAppData%\\Chromium\\User Data on Windows) to chromium_profile_<context_id> for each context.
    - Note: Extensions load natively in launch_persistent_context mode but may not load dynamically in browser.launch mode without additional setup (e.g., automating chrome://extensions/).

    Running the Script:
    - Save the script as script.py.
    - Run it from the terminal: "python script.py".
    - The script will:
        - Create chromium_profiles and browser_states directories in the script's working directory.
        - Run two tests:
            1. launch_persistent_context mode: Creates 3 contexts, each with a unique user data directory (chromium_profile_<context_id>) and storage state file (browser_state_<context_id>.json).
            2. browser.launch mode: Creates 3 contexts using a single browser instance, with manual persistence.
        - For each context:
            - Copies all extensions from EXTENSION_PATHS to the context's profile directory (chromium_profile_<context_id>/Extensions).
            - Loads session data (cookies, local storage) from browser_state_<context_id>.json if it exists.
            - Navigates to https://example.com and logs the page title as a sample action.
            - Saves session data and backs up the profile directory (in launch_persistent_context mode) when closing.
    - Output will include logs for directory creation, extension copying, state loading/saving, and page titles.

    Subsequent Runs:
    - The script restores profile directories and session states for each context using unique context IDs (UUIDs), ensuring persistence across runs.
    - Backed-up profile directories (chromium_profile_backup_<context_id>) are used to restore chromium_profile_<context_id> if needed.
    - Session data persists in browser_state_<context_id>.json files.

    Customization:
    - Change the number of contexts by modifying num_contexts in run().
    - Switch modes by setting use_persistent_context=True (launch_persistent_context) or False (browser.launch) in run().
    - Update EXTENSION_PATHS with your extension directories.
    - Add context options (e.g., user_agent, locale) by passing them to new_context(**kwargs) in run_context().

    DEBUGGING TIPS
    --------------
    If you encounter issues while running the script, consider the following debugging strategies:

    Extension Not Loading:
    - Verify each path in EXTENSION_PATHS points to a valid unpacked extension directory containing a manifest.json file.
    - Check the chromium_profile_<context_id>/Extensions directory to ensure extensions were copied correctly.
    - In launch_persistent_context mode:
        - Open the browser (since headless=False) and navigate to chrome://extensions/ to confirm extensions are loaded.
        - Ensure extensions are compatible with Chromium and have a valid manifest.
    - In browser.launch mode:
        - Extensions may not load dynamically due to Playwright's limitations.
        - To enable extensions, pre-install them in a Chromium browser and copy the profile to chromium_profile_<context_id>.
        - Alternatively, automate extension installation by navigating to chrome://extensions/ and enabling developer mode (not implemented in this script).
    - If extensions fail to load, check console logs for errors related to install_extension().

    Session Data Issues:
    - Ensure browser_state_<context_id>.json files in the browser_states directory contain valid JSON.
    - Open these files to verify cookies and local storage data are correctly formatted.
    - Check console output for messages like "Loaded browser state..." or "Saved browser state..." to confirm state handling.
    - If session data does not persist, verify write permissions for the browser_states directory.
    - Test by logging into a website in one run and checking if the login persists in the next run.

    Backup Errors:
    - The copytree_ignore_errors function prevents errors related to SingletonLock, SingletonCookie, and SingletonSocket files by ignoring them during backups.
    - If backup issues persist, check console logs for messages like "Skipped problematic file..." or "Non-critical errors during copy...".
    - Ensure no other processes (e.g., lingering Chromium instances) are accessing chromium_profile_<context_id> directories.
    - Verify write permissions for the chromium_profiles directory.
    - If backups fail, manually inspect chromium_profile_backup_<context_id> to ensure critical files (e.g., extensions, cache) were copied.

    Indentation Errors:
    - The code uses consistent 4-space indentation per PEP 8. If syntax errors occur, ensure no tabs are mixed with spaces.
    - Copy the code directly to avoid indentation issues introduced by text editors or copy-paste operations.
    - Use an editor with Python linting (e.g., VS Code with Pylance) to detect indentation problems.

    Resource Usage:
    - In launch_persistent_context mode, each context runs a separate Chromium instance, increasing memory and CPU usage. Monitor system resources if using many contexts.
    - In browser.launch mode, a single browser instance is used, but extension/cache persistence is limited.
    - If performance is an issue, reduce num_contexts or switch to browser.launch mode.

    Other Errors:
    - If you encounter runtime errors (e.g., network issues, Playwright exceptions), check the full traceback logged by the script.
    - Share the error message and traceback for specific debugging assistance.
    - Common issues include:
        - Network errors: Ensure internet connectivity and that https://example.com is accessible.
        - Playwright errors: Verify Playwright and Chromium are installed correctly.
        - File permission errors: Run the script with appropriate permissions (e.g., as administrator on Windows if needed).
    - To isolate issues, try running with num_contexts=1 and a single extension.

    Logging:
    - The script includes verbose logging for key actions (e.g., directory creation, extension copying, state loading/saving).
    - Review console output to trace execution and identify where errors occur.
    - Add custom logging.info statements in functions like new_context() or install_extension() for deeper debugging.

    Testing:
    - Test with a small number of contexts (e.g., num_contexts=1) to verify basic functionality.
    - Test each mode separately by commenting out one call in main().
    - Test with and without extensions to isolate extension-related issues.
    - Run on different platforms (Linux, macOS, Windows) to confirm cross-platform compatibility.
    """
    async with async_playwright() as playwright:
        try:
            # Test with launch_persistent_context mode
            logging.info("Running with launch_persistent_context...")
            await run(playwright, num_contexts=3, use_persistent_context=True)
            # Test with browser.launch mode
            logging.info("\nRunning with browser.launch...")
            await run(playwright, num_contexts=3, use_persistent_context=False)
        except Exception as e:
            # logging.exception records the message at ERROR level together with
            # the traceback; the previous INFO-level message carried no traceback
            # and is hidden under the default logging configuration.
            logging.exception("An error occurred: %s", e)
if __name__ == "__main__":
    # The root logger defaults to WARNING, which would discard every INFO
    # message this script emits; configure it before running anything.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop the script; exit without a traceback.
        pass
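# --- Code-hosting page footer captured with the file; not part of the script ---
# Markdown is supported
# 0% or
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or sign in to comment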