import os
import sys
import asyncio

# Make the package importable when this script is run from its own directory.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Assuming that the changes made allow different configurations
# for managed browser, persistent context, and so forth.


async def test_default_headless():
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
        use_managed_browser=False,
        use_persistent_context=False,
        ignore_https_errors=True,
        # Testing a normal ephemeral context
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_default_headless] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_managed_browser_persistent():
    # Treating use_persistent_context=True as the managed-browser scenario.
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
        use_managed_browser=True,
        use_persistent_context=True,  # should now behave the same as a managed browser
        user_data_dir="./output/test_profile",
        # This should store and reuse profile data across runs
    ) as crawler:
        result = await crawler.arun(
            url='https://www.google.com',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_managed_browser_persistent] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_session_reuse():
    # Test creating a session and reusing it across multiple calls.
    session_id = "my_session"
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        # Fixed user-agent for consistency
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        # First call: create the session
        result1 = await crawler.arun(
            url='https://www.example.com',
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_session_reuse first call] success:", result1.success)

        # Second call: same session, so cookies may be retained
        result2 = await crawler.arun(
            url='https://www.example.com/about',
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_session_reuse second call] success:", result2.success)


async def test_magic_mode():
    # Test magic mode with override_navigator and simulate_user
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
        use_managed_browser=False,
        use_persistent_context=False,
        magic=True,
        override_navigator=True,
        simulate_user=True,
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/business',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_magic_mode] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_proxy_settings():
    # Test with a proxy (if available) to ensure the crawl runs through the proxy
    async with AsyncWebCrawler(
        headless=True,
        verbose=False,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        proxy="http://127.0.0.1:8080",  # Assuming a local proxy server for the test
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://httpbin.org/ip',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_proxy_settings] success:", result.success)
        if result.success:
            print("HTML preview:", result.html[:200] if result.html else "")


async def test_ignore_https_errors():
    # Test ignoring HTTPS errors against a domain with an invalid certificate.
    # The domain just needs to be one that triggers an SSL error;
    # self-signed.badssl.com deliberately serves a self-signed certificate.
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent="Mozilla/5.0",
        ignore_https_errors=True,
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://self-signed.badssl.com/',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_ignore_https_errors] success:", result.success)


async def main():
    print("Running tests...")
    # await test_default_headless()
    # await test_managed_browser_persistent()
    # await test_session_reuse()
    # await test_magic_mode()
    # await test_proxy_settings()
    await test_ignore_https_errors()


if __name__ == "__main__":
    asyncio.run(main())