import os
import re
import sys
import json
import asyncio

import pytest
from bs4 import BeautifulSoup

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler
# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()
# Add this test to your existing test file
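# This test exercises session-based multi-page crawling: the same browser session
# (identified by session_id) is reused across arun() calls, a small JS snippet
# clicks GitHub's "Next" pagination button, and js_only is set for later
# iterations so the crawler only executes that JS in the already-open page
# instead of navigating to the URL again. The on_execution_started hook waits
# until the first commit heading changes, signaling that the next page of
# commits has actually rendered before the HTML is captured.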
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    first_commit = ""

    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Wait until the page's first commit <h4> text
            # (document.querySelector('li.Box-sc-g0xbh4-0 h4')) differs from the
            # previously recorded first commit, i.e. the next page has loaded
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # Only use URL for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            # Take the first commit, find its <h4>, and extract the text so the
            # hook above can detect when the next page has loaded
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)

            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

    # Assertions
    assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
    print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
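# Note: to run only the multi-page test from the command line, something along
# these lines should work (assuming pytest-asyncio is installed so the
# @pytest.mark.asyncio marker is honored):
#
#   pytest <this_file>.py -v -k typescript_commits_multi_page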