# File: async_webcrawler_multiple_urls_example.py

import os
import sys

# Make the in-repo crawl4ai package importable: walk two directories up from
# this file's folder and append that path to sys.path.
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(parent_dir)

import asyncio

from crawl4ai import AsyncWebCrawler
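
# Note: if crawl4ai is installed from PyPI (`pip install crawl4ai`), the
# sys.path manipulation above is unnecessary; it only matters when running
# this example straight from a source checkout.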

async def main():
    # Initialize the AsyncWebCrawler; the async context manager handles
    # browser startup and cleanup automatically.
    async with AsyncWebCrawler(verbose=True) as crawler:
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
            "https://news.ycombinator.com",
        ]
        # Set up crawling parameters
        word_count_threshold = 100

        # Run the crawling process for multiple URLs in a single call
        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True,
        )
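
        # Version caveat (an assumption about newer releases, not part of the
        # original example): crawl4ai 0.4+ moved per-run options into a
        # CrawlerRunConfig object, roughly:
        #   from crawl4ai import CrawlerRunConfig, CacheMode
        #   results = await crawler.arun_many(
        #       urls=urls,
        #       config=CrawlerRunConfig(
        #           word_count_threshold=word_count_threshold,
        #           cache_mode=CacheMode.BYPASS,
        #       ),
        #   )
        # The keyword-argument style above matches the older API this example
        # was written for; check your installed version.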
        # Process the results
        for result in results:
            if result.success:
                print(f"Successfully crawled: {result.url}")
                print(f"Title: {result.metadata.get('title', 'N/A')}")
                print(f"Word count: {len(result.markdown.split())}")
                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
                print(f"Number of images: {len(result.media.get('images', []))}")
                print("---")
            else:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")

if __name__ == "__main__":
    asyncio.run(main())