import os
import re
import sys
import json
import asyncio

import pytest
from bs4 import BeautifulSoup

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler
# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()
# Add this test to your existing test file
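# This test exercises session-based multi-page crawling: the same browser session
# (identified by session_id) is reused across arun() calls, a small JS snippet
# clicks GitHub's "Next" pagination button, and js_only is set for later
# iterations so the crawler only executes that JS in the already-open page
# instead of navigating to the URL again. The on_execution_started hook waits
# until the first commit heading changes, signaling that the next page of
# commits has actually rendered before the HTML is captured.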
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    first_commit = ""

    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Wait until the page's first commit <h4> text
            # (document.querySelector('li.Box-sc-g0xbh4-0 h4')) differs from the
            # previously recorded first commit, i.e. the next page has loaded
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # Only use URL for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            # Take the first commit, find its <h4>, and extract the text so the
            # hook above can detect when the next page has loaded
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)

            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

    # Assertions
    assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
    print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
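# Note: to run only the multi-page test from the command line, something along
# these lines should work (assuming pytest-asyncio is installed so the
# @pytest.mark.asyncio marker is honored):
#
#   pytest <this_file>.py -v -k typescript_commits_multi_page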