import os
import sys
import asyncio

# Make the package importable when this script is run from its own directory.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Assuming that the changes made allow different configurations
# for managed browser, persistent context, and so forth.


async def test_default_headless():
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
        use_managed_browser=False,
        use_persistent_context=False,
        ignore_https_errors=True,
        # Testing a normal ephemeral context
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_default_headless] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_managed_browser_persistent():
    # Treating use_persistent_context=True as the managed-browser scenario.
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
        use_managed_browser=True,
        use_persistent_context=True,  # should now behave the same as a managed browser
        user_data_dir="./output/test_profile",
        # This should store and reuse profile data across runs
    ) as crawler:
        result = await crawler.arun(
            url='https://www.google.com',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_managed_browser_persistent] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_session_reuse():
    # Test creating a session and reusing it across multiple calls.
    session_id = "my_session"
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        # Fixed user-agent for consistency
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        # First call: create the session
        result1 = await crawler.arun(
            url='https://www.example.com',
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_session_reuse first call] success:", result1.success)

        # Second call: same session, so cookies may be retained
        result2 = await crawler.arun(
            url='https://www.example.com/about',
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_session_reuse second call] success:", result2.success)


async def test_magic_mode():
    # Test magic mode with override_navigator and simulate_user
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
        use_managed_browser=False,
        use_persistent_context=False,
        magic=True,
        override_navigator=True,
        simulate_user=True,
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/business',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_magic_mode] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_proxy_settings():
    # Test with a proxy (if available) to ensure the crawl runs through the proxy
    async with AsyncWebCrawler(
        headless=True,
        verbose=False,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        proxy="http://127.0.0.1:8080",  # Assuming a local proxy server for the test
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://httpbin.org/ip',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_proxy_settings] success:", result.success)
        if result.success:
            print("HTML preview:", result.html[:200] if result.html else "")


async def test_ignore_https_errors():
    # Test ignoring HTTPS errors against a domain with an invalid certificate.
    # The domain just needs to be one that triggers an SSL error;
    # self-signed.badssl.com deliberately serves a self-signed certificate.
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent="Mozilla/5.0",
        ignore_https_errors=True,
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://self-signed.badssl.com/',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_ignore_https_errors] success:", result.success)


async def main():
    print("Running tests...")
    # await test_default_headless()
    # await test_managed_browser_persistent()
    # await test_session_reuse()
    # await test_magic_mode()
    # await test_proxy_settings()
    await test_ignore_https_errors()


if __name__ == "__main__":
    asyncio.run(main())