Add playwright_helper

f8bc4f3e · nextime · 3b722898 · f8bc4f3e
Commit f8bc4f3e authored Jun 24, 2025 by nextime
Show whitespace changes
Inline Side-by-side

Showing with 389 additions and 0 deletions

playwright_helper.py shmcs/playwright_helper.py +389 -0

No files found.
--- a/shmcs/playwright_helper.py
+++ b/shmcs/playwright_helper.py
+# -*- coding: utf-8 -*-
+from playwright.async_api import async_playwright, Browser, BrowserContext
+import json
+import shutil
+import errno
+from pathlib import Path
+import asyncio
+import uuid
+from typing import List, Optional
+
+import logging
+
+# NOTE(review): the logger returned by this call is not bound to a name; the
+# functions below log through the module-level ``logging.info`` helpers, i.e.
+# through the root logger.
+logging.getLogger(__name__)
+
+
+# Base directories for storing profile data and storage states (cross-platform)
+# Both paths are relative, so they resolve against the current working directory.
+BASE_PROFILE_DIR = Path("data/chromium_profiles")  # Directory for user data/profile directories
+BASE_STATE_DIR = Path("data/browser_states")      # Directory for storage state JSON files
+
+def copytree_ignore_errors(src: Path, dst: Path, ignore_patterns: list = ["SingletonLock", "SingletonCookie", "SingletonSocket"]):
+    """Copy a directory tree, ignoring specified files and handling errors gracefully."""
+    # Define a function to ignore files matching specified patterns (e.g., Singleton* files)
+    def ignore_files(directory, files):
+        return [f for f in files if any(pattern in f for pattern in ignore_patterns)]
+    
+    try:
+        # Copy the directory tree, ignoring specified files
+        shutil.copytree(src, dst, ignore=ignore_files, dirs_exist_ok=False)
+    except shutil.Error as e:
+        # Log non-critical copy errors (e.g., permission issues)
+        logging.info(f"Non-critical errors during copy: {e}")
+    except OSError as e:
+        # Handle specific OS errors (e.g., file not found, device busy) gracefully
+        if e.errno not in (errno.ENOENT, errno.EAGAIN, errno.EBUSY):
+            raise
+        logging.info(f"Skipped problematic file during copy: {e}")
+
+class PersistentBrowser:
+    """Simulates a Playwright Browser object using either launch_persistent_context or browser.launch."""
+    
+    def __init__(self, playwright, base_profile_dir: Path, use_persistent_context: bool, extension_paths: List[Path] = None, **kwargs):
+        """Initialize the simulated browser."""
+        # Store Playwright instance for browser operations
+        self._playwright = playwright
+        # Base directory for profile data
+        self._base_profile_dir = base_profile_dir
+        # Flag to choose between launch_persistent_context and browser.launch
+        self._use_persistent_context = use_persistent_context
+        # List of extension paths to copy to each context (converted to Path objects)
+        self._extension_paths = [Path(p) for p in (extension_paths or [])]
+        # Dictionary to store context_id -> BrowserContext
+        self._contexts = {}
+        # Browser instance for browser.launch mode
+        self._browser = None
+        # Flag to track if the browser is closed
+        self._is_closed = False
+
+        self._kwargs = kwargs
+
+    def contexts(self):
+       return self._browser.contexts
+
+    async def launch(self):
+        """Simulate browser.launch() by preparing the environment."""
+        # Create base profile directory if it doesn't exist
+        self._base_profile_dir.mkdir(parents=True, exist_ok=True)
+        # If using browser.launch mode, launch a single Chromium instance
+        if not self._use_persistent_context:
+            self._browser = await self._playwright.chromium.launch(headless=False, **self._kwargs)
+        logging.info("Persistent browser launched.")
+        return self
+
+    async def new_context(self, context_id: str = None, **kwargs) -> BrowserContext:
+        """Create a new context with its own persistent state."""
+        # Check if the browser is closed
+        if self._is_closed:
+            raise RuntimeError("Browser is closed.")
+
+        # Generate a unique context ID if not provided
+        context_id = context_id or str(uuid.uuid4())
+        
+        # Create or restore profile directory for this context
+        profile_dir = self._get_profile_dir(context_id)
+        if not profile_dir.exists():
+            restore_profile_dir(context_id, self._base_profile_dir)
+            create_profile_dir(context_id, self._base_profile_dir)
+
+        # Create context based on mode
+        if self._use_persistent_context:
+            # Use launch_persistent_context for native persistence via user_data_dir
+            context = await self._playwright.chromium.launch_persistent_context(
+                user_data_dir=profile_dir,
+                headless=False,
+                viewport=kwargs.get("viewport", {"width": 1280, "height": 720}),
+                **{k: v for k, v in kwargs.items() if k != "extension_paths"}  # Filter out extension_paths
+            )
+        else:
+            # Use browser.launch with manual persistence
+            if not self._browser:
+                raise RuntimeError("Browser not launched.")
+            if kwargs.get("viewport"):
+               vp = kwargs.get("viewport", {"width": 1280, "height": 720})
+            else:
+               vp = None
+            context = await self._browser.new_context(
+                viewport=vp,
+                **{k: v for k, v in kwargs.items() if k != "extension_paths" and k!="viewport"}
+            )
+
+        # Load storage state manually if it exists
+        state_file = BASE_STATE_DIR / f"browser_state_{context_id}.json"
+        if state_file.exists():
+            with open(state_file, 'r', encoding='utf-8') as f:
+                storage_state = json.load(f)
+                # Load cookies from storage state
+                if "cookies" in storage_state:
+                    await context.add_cookies(storage_state["cookies"])
+                # Load local storage from storage state
+                if "origins" in storage_state:
+                    for origin in storage_state["origins"]:
+                        for item in origin.get("localStorage", []):
+                            await context.evaluate(
+                                """
+                                ({name, value}) => {
+                                    window.localStorage.setItem(name, value);
+                                }
+                                """,
+                                {"name": item["name"], "value": item["value"]}
+                            )
+            logging.info(f"Loaded browser state for context {context_id} from {state_file}")
+
+        # Store the context
+        self._contexts[context_id] = context
+        logging.info(f"Created new context with ID {context_id} using profile {profile_dir}")
+
+        # Install pre-set extensions for this context
+        for ext_path in self._extension_paths:
+            await install_extension(context, ext_path, context_id)
+
+        return context
+
+    async def close(self):
+        """Close all contexts and mark the browser as closed."""
+        if not self._is_closed:
+            for context_id, context in self._contexts.items():
+                # Save storage state before closing
+                await save_browser_state(context, context_id)
+                # Close context to release Singleton* files
+                await context.close()
+                # Backup profile directory after closing (only for persistent context mode)
+                if self._use_persistent_context:
+                    backup_profile_dir(context_id, self._base_profile_dir)
+            # Close the browser if in browser.launch mode
+            if self._browser:
+                await self._browser.close()
+            self._contexts.clear()
+            self._is_closed = True
+            logging.info("Persistent browser closed.")
+
+    def _get_profile_dir(self, context_id: str) -> Path:
+        """Get the profile directory for a context."""
+        return self._base_profile_dir / f"chromium_profile_{context_id}"
+
+async def save_browser_state(context: BrowserContext, context_id: str):
+    """Save the browser's storage state for a specific context."""
+    state_file = BASE_STATE_DIR / f"browser_state_{context_id}.json"
+    await context.storage_state(path=state_file)
+    logging.info(f"Saved browser state for context {context_id} to {state_file}")
+
+def create_profile_dir(context_id: str, base_profile_dir: Path):
+    """Create a profile directory for a specific context if it doesn't exist."""
+    profile_dir = base_profile_dir / f"chromium_profile_{context_id}"
+    profile_dir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"Created profile directory for context {context_id} at {profile_dir}")
+
+def backup_profile_dir(context_id: str, base_profile_dir: Path):
+    """Copy the profile directory for a specific context to a backup location."""
+    profile_dir = base_profile_dir / f"chromium_profile_{context_id}"
+    backup_dir = base_profile_dir / f"chromium_profile_backup_{context_id}"
+    if profile_dir.exists():
+        if backup_dir.exists():
+            shutil.rmtree(backup_dir)  # Remove old backup
+        copytree_ignore_errors(profile_dir, backup_dir)
+        logging.info(f"Backed up profile directory for context {context_id} to {backup_dir}")
+
+def restore_profile_dir(context_id: str, base_profile_dir: Path):
+    """Restore the profile directory for a specific context from the backup."""
+    profile_dir = base_profile_dir / f"chromium_profile_{context_id}"
+    backup_dir = base_profile_dir / f"chromium_profile_backup_{context_id}"
+    if backup_dir.exists():
+        if profile_dir.exists():
+            shutil.rmtree(profile_dir)  # Remove current profile dir
+        copytree_ignore_errors(backup_dir, profile_dir)
+        logging.info(f"Restored profile directory for context {context_id} from {backup_dir}")
+    else:
+        logging.info(f"No backup profile directory found for context {context_id}.")
+
+async def install_extension(context: BrowserContext, extension_path: Path, context_id: str):
+    """Load an extension for a specific context."""
+    extension_path = Path(extension_path)
+    profile_dir = BASE_PROFILE_DIR / f"chromium_profile_{context_id}"
+    if extension_path.exists():
+        # Copy extension to the user data directory's Extensions folder
+        extension_dest = profile_dir / "Extensions" / extension_path.name
+        if not extension_dest.exists():
+            shutil.copytree(extension_path, extension_dest)
+            logging.info(f"Copied extension to {extension_dest} for context {context_id}")
+        # Add a script to simulate extension interaction (optional)
+        await context.add_init_script(
+            script="""chrome.runtime.sendMessage({ message: 'Extension loaded' });"""
+        )
+        logging.info(f"Extension loaded from {extension_path} for context {context_id}")
+    else:
+        logging.info(f"Extension path {extension_path} does not exist for context {context_id}.")
+
+async def run_context(persistent_browser: PersistentBrowser, context_id: str, extension_paths: List[Path]):
+    """Run a single browser context with its own persistent state."""
+    # Create a new context with the pre-set extensions
+    context = await persistent_browser.new_context(
+        context_id=context_id,
+        extension_paths=extension_paths
+    )
+
+    # Open a new page in the context
+    page = await context.new_page()
+
+    # Navigate to a sample website
+    await page.goto("https://example.com")
+
+    # Print the page title as an example action
+    logging.info(f"Page title for context {context_id}: {await page.title()}")
+
+async def run(playwright, num_contexts=3, use_persistent_context=True):
+    """Run multiple browser contexts with persistent states."""
+    # Ensure base directories exist
+    BASE_PROFILE_DIR.mkdir(parents=True, exist_ok=True)
+    BASE_STATE_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Define pre-set extensions to be copied to all contexts
+    EXTENSION_PATHS = [
+        Path("./my_extension1"),  # Example: use forward slashes for cross-platform compatibility
+        Path("./my_extension2")   # Replace with actual paths to unpacked extension directories
+    ]
+
+    # Create simulated browser with the chosen mode and extensions
+    persistent_browser = PersistentBrowser(playwright, BASE_PROFILE_DIR, use_persistent_context, EXTENSION_PATHS)
+    await persistent_browser.launch()
+
+    # Create tasks for multiple contexts
+    tasks = []
+    for _ in range(num_contexts):
+        context_id = str(uuid.uuid4())  # Unique ID for each context
+        tasks.append(run_context(persistent_browser, context_id, EXTENSION_PATHS))
+
+    # Run all contexts concurrently
+    await asyncio.gather(*tasks)
+
+    while True:
+       asyncio.sleep(0.1)
+    # Close the simulated browser, saving states and backing up profiles
+    await persistent_browser.close()
+
+async def main():
+    """Main entry point to test both modes."""
+    """
+    HOW TO USE
+    ----------
+    This script provides a unified solution for managing multiple Playwright browser contexts with persistent states (session data, extensions, cache) using two approaches:
+    1. launch_persistent_context mode: Uses Playwright's launch_persistent_context() for native persistence via user data directories.
+    2. browser.launch mode: Uses Playwright's browser.launch() with manual persistence via storage state files and profile directories.
+
+    Prerequisites:
+    - Install Playwright: Run "pip install playwright" in your terminal.
+    - Install Chromium: Run "playwright install chromium" to download the Chromium browser used by Playwright.
+    - Ensure Python 3.7+ is installed to support async/await syntax.
+
+    Setting Up Extensions:
+    - Update the EXTENSION_PATHS list in the run() function with paths to unpacked extension directories, each containing a manifest.json file.
+    - Example:
+        EXTENSION_PATHS = [
+            Path("./my_extension1"),
+            Path("./my_extension2")
+        ]
+    - To obtain unpacked extensions:
+        - On Linux/macOS: Extract a .crx file using "unzip <extension>.crx".
+        - On Windows: Use 7-Zip to extract a .crx file (it is a ZIP archive).
+    - Alternatively, pre-install extensions in a Chromium browser and copy its profile directory (e.g., ~/.config/chromium on Linux, ~/Library/Application Support/Chromium on macOS, %LocalAppData%\\Chromium\\User Data on Windows) to chromium_profile_<context_id> for each context.
+    - Note: Extensions load natively in launch_persistent_context mode but may not load dynamically in browser.launch mode without additional setup (e.g., automating chrome://extensions/).
+
+    Running the Script:
+    - Save the script as script.py.
+    - Run it from the terminal: "python script.py".
+    - The script will:
+        - Create chromium_profiles and browser_states directories in the script's working directory.
+        - Run two tests:
+            1. launch_persistent_context mode: Creates 3 contexts, each with a unique user data directory (chromium_profile_<context_id>) and storage state file (browser_state_<context_id>.json).
+            2. browser.launch mode: Creates 3 contexts using a single browser instance, with manual persistence.
+        - For each context:
+            - Copies all extensions from EXTENSION_PATHS to the context's profile directory (chromium_profile_<context_id>/Extensions).
+            - Loads session data (cookies, local storage) from browser_state_<context_id>.json if it exists.
+            - Navigates to https://example.com and logging.infos the page title as a sample action.
+            - Saves session data and backs up the profile directory (in launch_persistent_context mode) when closing.
+    - Output will include logs for directory creation, extension copying, state loading/saving, and page titles.
+
+    Subsequent Runs:
+    - The script restores profile directories and session states for each context using unique context IDs (UUIDs), ensuring persistence across runs.
+    - Backed-up profile directories (chromium_profile_backup_<context_id>) are used to restore chromium_profile_<context_id> if needed.
+    - Session data persists in browser_state_<context_id>.json files.
+
+    Customization:
+    - Change the number of contexts by modifying num_contexts in run().
+    - Switch modes by setting use_persistent_context=True (launch_persistent_context) or False (browser.launch) in run().
+    - Update EXTENSION_PATHS with your extension directories.
+    - Add context options (e.g., user_agent, locale) by passing them to new_context(**kwargs) in run_context().
+
+    DEBUGGING TIPS
+    --------------
+    If you encounter issues while running the script, consider the following debugging strategies:
+
+    Extension Not Loading:
+    - Verify each path in EXTENSION_PATHS points to a valid unpacked extension directory containing a manifest.json file.
+    - Check the chromium_profile_<context_id>/Extensions directory to ensure extensions were copied correctly.
+    - In launch_persistent_context mode:
+        - Open the browser (since headless=False) and navigate to chrome://extensions/ to confirm extensions are loaded.
+        - Ensure extensions are compatible with Chromium and have a valid manifest.
+    - In browser.launch mode:
+        - Extensions may not load dynamically due to Playwright's limitations.
+        - To enable extensions, pre-install them in a Chromium browser and copy the profile to chromium_profile_<context_id>.
+        - Alternatively, automate extension installation by navigating to chrome://extensions/ and enabling developer mode (not implemented in this script).
+    - If extensions fail to load, check console logs for errors related to install_extension().
+
+    Session Data Issues:
+    - Ensure browser_state_<context_id>.json files in the browser_states directory contain valid JSON.
+    - Open these files to verify cookies and local storage data are correctly formatted.
+    - Check console output for messages like "Loaded browser state..." or "Saved browser state..." to confirm state handling.
+    - If session data does not persist, verify write permissions for the browser_states directory.
+    - Test by logging into a website in one run and checking if the login persists in the next run.
+
+    Backup Errors:
+    - The copytree_ignore_errors function prevents errors related to SingletonLock, SingletonCookie, and SingletonSocket files by ignoring them during backups.
+    - If backup issues persist, check console logs for messages like "Skipped problematic file..." or "Non-critical errors during copy...".
+    - Ensure no other processes (e.g., lingering Chromium instances) are accessing chromium_profile_<context_id> directories.
+    - Verify write permissions for the chromium_profiles directory.
+    - If backups fail, manually inspect chromium_profile_backup_<context_id> to ensure critical files (e.g., extensions, cache) were copied.
+
+    Indentation Errors:
+    - The code uses consistent 4-space indentation per PEP 8. If syntax errors occur, ensure no tabs are mixed with spaces.
+    - Copy the code directly to avoid indentation issues introduced by text editors or copy-paste operations.
+    - Use an editor with Python linting (e.g., VS Code with Pylance) to detect indentation problems.
+
+    Resource Usage:
+    - In launch_persistent_context mode, each context runs a separate Chromium instance, increasing memory and CPU usage. Monitor system resources if using many contexts.
+    - In browser.launch mode, a single browser instance is used, but extension/cache persistence is limited.
+    - If performance is an issue, reduce num_contexts or switch to browser.launch mode.
+
+    Other Errors:
+    - If you encounter runtime errors (e.g., network issues, Playwright exceptions), check the full traceback logging.infoed by the script.
+    - Share the error message and traceback for specific debugging assistance.
+    - Common issues include:
+        - Network errors: Ensure internet connectivity and that https://example.com is accessible.
+        - Playwright errors: Verify Playwright and Chromium are installed correctly.
+        - File permission errors: Run the script with appropriate permissions (e.g., as administrator on Windows if needed).
+    - To isolate issues, try running with num_contexts=1 and a single extension.
+
+    Logging:
+    - The script includes verbose logging for key actions (e.g., directory creation, extension copying, state loading/saving).
+    - Review console output to trace execution and identify where errors occur.
+    - Add custom logging.info statements in functions like new_context() or install_extension() for deeper debugging.
+
+    Testing:
+    - Test with a small number of contexts (e.g., num_contexts=1) to verify basic functionality.
+    - Test each mode separately by commenting out one call in main().
+    - Test with and without extensions to isolate extension-related issues.
+    - Run on different platforms (Linux, macOS, Windows) to confirm cross-platform compatibility.
+    """
+    async with async_playwright() as playwright:
+        try:
+            # Test with launch_persistent_context mode
+            logging.info("Running with launch_persistent_context...")
+            await run(playwright, num_contexts=3, use_persistent_context=True)
+            
+            # Test with browser.launch mode
+            logging.info("\nRunning with browser.launch...")
+            await run(playwright, num_contexts=3, use_persistent_context=False)
+        except Exception as e:
+            logging.info(f"An error occurred: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(main())