# Prefix-Based Input Handling in Crawl4AI

This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.

## Crawling a Web URL

To crawl a live web page, provide the URL starting with `http://` or `https://`, along with a `CrawlerRunConfig` object:
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_web():
    config = CrawlerRunConfig(bypass_cache=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config)
        if result.success:
            print("Markdown Content:")
            print(result.markdown)
        else:
            print(f"Failed to crawl: {result.error_message}")

asyncio.run(crawl_web())
```
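The same crawler instance can also serve several pages: open the `async with` context once and call `arun` repeatedly instead of restarting the browser for every URL, which is how the complete example at the end of this guide is structured as well. A minimal sketch (the second URL is just a placeholder):

```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_many():
    config = CrawlerRunConfig(bypass_cache=True)
    urls = [
        "https://en.wikipedia.org/wiki/apple",
        "https://en.wikipedia.org/wiki/Banana",  # placeholder second page
    ]
    async with AsyncWebCrawler() as crawler:  # one crawler session reused for all URLs
        for url in urls:
            result = await crawler.arun(url=url, config=config)
            if result.success:
                print(f"{url}: {len(result.markdown)} characters of markdown")
            else:
                print(f"{url}: failed ({result.error_message})")

asyncio.run(crawl_many())
```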
## Crawling a Local HTML File

To crawl a local HTML file, prefix its absolute path with `file://`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_local_file():
    local_file_path = "/path/to/apple.html"  # Replace with your file path
    file_url = f"file://{local_file_path}"
    config = CrawlerRunConfig(bypass_cache=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=file_url, config=config)
        if result.success:
            print("Markdown Content from Local File:")
            print(result.markdown)
        else:
            print(f"Failed to crawl local file: {result.error_message}")

asyncio.run(crawl_local_file())
```
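The f-string above assumes a POSIX-style absolute path. If you are already working with `pathlib`, the standard-library `Path.as_uri()` method builds a well-formed `file://` URL on any platform; the path below is a placeholder:

```python
from pathlib import Path

local_file = Path("/path/to/apple.html")  # replace with your file path
file_url = local_file.resolve().as_uri()  # e.g. "file:///path/to/apple.html"
```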
## Crawling Raw HTML Content

To crawl raw HTML content, prefix the HTML string with `raw:`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def crawl_raw_html():
    raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
    raw_html_url = f"raw:{raw_html}"
    config = CrawlerRunConfig(bypass_cache=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=raw_html_url, config=config)
        if result.success:
            print("Markdown Content from Raw HTML:")
            print(result.markdown)
        else:
            print(f"Failed to crawl raw HTML: {result.error_message}")

asyncio.run(crawl_raw_html())
```
---

## Complete Example

Below is a comprehensive script that:

1. Crawls the Wikipedia page for "Apple."
2. Saves the HTML content to a local file (`apple.html`).
3. Crawls the local HTML file and verifies the markdown length matches the original crawl.
4. Crawls the raw HTML content from the saved file and verifies consistency.
```python
import os
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

async def main():
    wikipedia_url = "https://en.wikipedia.org/wiki/apple"
    script_dir = Path(__file__).parent
    html_file_path = script_dir / "apple.html"

    async with AsyncWebCrawler() as crawler:
        # Step 1: Crawl the Web URL
        print("\n=== Step 1: Crawling the Wikipedia URL ===")
        web_config = CrawlerRunConfig(bypass_cache=True)
        result = await crawler.arun(url=wikipedia_url, config=web_config)

        if not result.success:
            print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
            return

        with open(html_file_path, 'w', encoding='utf-8') as f:
            f.write(result.html)
        web_crawl_length = len(result.markdown)
        print(f"Length of markdown from web crawl: {web_crawl_length}\n")

        # Step 2: Crawl from the Local HTML File
        print("=== Step 2: Crawling from the Local HTML File ===")
        file_url = f"file://{html_file_path.resolve()}"
        file_config = CrawlerRunConfig(bypass_cache=True)
        local_result = await crawler.arun(url=file_url, config=file_config)

        if not local_result.success:
            print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
            return

        local_crawl_length = len(local_result.markdown)
        assert web_crawl_length == local_crawl_length, "Markdown length mismatch"
        print("✅ Markdown length matches between web and local file crawl.\n")

        # Step 3: Crawl Using Raw HTML Content
        print("=== Step 3: Crawling Using Raw HTML Content ===")
        with open(html_file_path, 'r', encoding='utf-8') as f:
            raw_html_content = f.read()
        raw_html_url = f"raw:{raw_html_content}"
        raw_config = CrawlerRunConfig(bypass_cache=True)
        raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

        if not raw_result.success:
            print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
            return

        raw_crawl_length = len(raw_result.markdown)
        assert web_crawl_length == raw_crawl_length, "Markdown length mismatch"
        print("✅ Markdown length matches between web and raw HTML crawl.\n")

        print("All tests passed successfully!")

    if html_file_path.exists():
        os.remove(html_file_path)

if __name__ == "__main__":
    asyncio.run(main())
```
---

## Conclusion

With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios.
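If your application receives inputs of all three kinds, you can centralize the prefix logic in a small helper. The sketch below is illustrative rather than part of the library: `to_crawl_url` and `crawl_any` are hypothetical names, and the heuristic that anything starting with `<` is raw HTML is an assumption you may want to adapt.

```python
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig

def to_crawl_url(source: str) -> str:
    """Map a plain input onto the prefix scheme Crawl4AI expects (hypothetical helper)."""
    if source.startswith(("http://", "https://", "file://", "raw:")):
        return source  # already prefixed, pass through unchanged
    if source.lstrip().startswith("<"):
        return f"raw:{source}"  # assumption: a leading '<' means an HTML string
    return Path(source).resolve().as_uri()  # otherwise treat it as a local file path

async def crawl_any(source: str) -> str:
    config = CrawlerRunConfig(bypass_cache=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=to_crawl_url(source), config=config)
        return result.markdown if result.success else ""
```

With a helper like this, `await crawl_any("https://en.wikipedia.org/wiki/apple")`, `await crawl_any("/path/to/apple.html")`, and `await crawl_any("<h1>Hello</h1>")` all go through the same code path.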