Spaces:
Sleeping
Sleeping
# ## Issue #236
# - **Last Updated:** 2024-11-11 01:42:14
# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236)
# - **State:** open
import asyncio
import os
import sys
import time
from typing import Any, Dict

# Put the parent of this file's directory on sys.path. This must happen
# before the crawl4ai import below so the package can be resolved from there.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator  # noqa: E402

# Absolute directory of this file; the tests below read fixtures from
# `<__location__>/data` and write results to `<__location__>/output`.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
def print_test_result(name: str, result: Dict[str, Any], execution_time: float) -> None:
    """Print a banner and timing for a test, then save its string results.

    Args:
        name: Human-readable test name. It is shown in the banner and, in
            lower case, used as the prefix of every file written.
        result: Mapping of label to content. Only ``str`` values are saved,
            each to ``<__location__>/output/<name.lower()>_<key>.md``.
        execution_time: Elapsed time in seconds, printed with four decimals.
    """
    print(f"\n{'='*20} {name} {'='*20}")
    print(f"Execution time: {execution_time:.4f} seconds")

    # Create the output directory on demand; without this the first run on a
    # fresh checkout fails with FileNotFoundError.
    output_dir = os.path.join(__location__, "output")
    os.makedirs(output_dir, exist_ok=True)

    # Save markdown to files
    prefix = name.lower()
    for key, content in result.items():
        if not isinstance(content, str):
            continue
        target = os.path.join(output_dir, f"{prefix}_{key}.md")
        # Explicit UTF-8: the generated markdown contains non-ASCII characters
        # (e.g. the citation brackets) that a locale default encoding may reject.
        with open(target, "w", encoding="utf-8") as f:
            f.write(content)

    # # Print first few lines of each markdown version
    # for key, content in result.items():
    #     if isinstance(content, str):
    #         preview = '\n'.join(content.split('\n')[:3])
    #         print(f"\n{key} (first 3 lines):")
    #         print(preview)
    #         print(f"Total length: {len(content)} characters")
def test_basic_markdown_conversion():
    """Convert the Wikipedia HTML fixture and check all three markdown outputs.

    Reads ``data/wikipedia.html`` next to this file, runs it through
    ``DefaultMarkdownGenerator.generate_markdown``, saves the outputs via
    ``print_test_result`` and asserts that the raw markdown, the cited
    markdown and the references section are all present.
    """
    # Read the fixture as UTF-8 explicitly so the test does not depend on the
    # platform's default encoding.
    with open(__location__ + "/data/wikipedia.html", "r", encoding="utf-8") as f:
        cleaned_html = f.read()

    generator = DefaultMarkdownGenerator()

    # Time only the conversion itself, not the file read or the reporting.
    start_time = time.perf_counter()
    result = generator.generate_markdown(
        cleaned_html=cleaned_html,
        base_url="https://en.wikipedia.org"
    )
    execution_time = time.perf_counter() - start_time

    print_test_result("Basic Markdown Conversion", {
        'raw': result.raw_markdown,
        'with_citations': result.markdown_with_citations,
        'references': result.references_markdown
    }, execution_time)

    # Basic assertions
    assert result.raw_markdown, "Raw markdown should not be empty"
    assert result.markdown_with_citations, "Markdown with citations should not be empty"
    assert result.references_markdown, "References should not be empty"
    assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
    assert "## References" in result.references_markdown, "Should contain references section"
def test_relative_links():
    """Check that relative and absolute link targets appear in the references.

    Relative targets are expected to be joined with the ``base_url`` passed
    to the generator; the absolute target is expected to appear as written.
    """
    source = """
    Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
    Also an [image](/images/test.png) and another [page](/wiki/Banana).
    """

    converted = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source,
        base_url="https://en.wikipedia.org"
    )

    expected_urls = (
        "https://en.wikipedia.org/wiki/Apple",
        "https://example.com",
        "https://en.wikipedia.org/images/test.png",
    )
    for url in expected_urls:
        assert url in converted.references_markdown
def test_duplicate_links():
    """Check that two links to the same target share one citation number."""
    source = """
    Here's a [link](/test) and another [link](/test) and a [different link](/other).
    """

    converted = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source,
        base_url="https://example.com"
    )

    # Both occurrences of /test should be rendered with the first citation marker.
    first_marker_uses = converted.markdown_with_citations.count("⟨1⟩")
    assert first_marker_uses == 2, "Same link should use same citation number"
def test_link_descriptions():
    """Check that link titles and link text are carried into the references."""
    source = """
    Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
    """

    converted = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source,
        base_url="https://example.com"
    )
    references = converted.references_markdown

    assert "Test Title" in references, "Link title should be in references"
    assert "link with description" in references, "Link text should be in references"
def test_performance_large_document():
    """Time repeated conversions of the large Wikipedia fixture.

    Reads ``data/wikipedia.md`` next to this file, converts it several times
    with one generator instance and prints the average, minimum and maximum
    wall-clock time. The test makes no assertions; it only reports timings.
    """
    # Read the fixture as UTF-8 explicitly so the test does not depend on the
    # platform's default encoding.
    with open(__location__ + "/data/wikipedia.md", "r", encoding="utf-8") as f:
        markdown = f.read()

    # Test with multiple iterations
    iterations = 5
    times = []

    generator = DefaultMarkdownGenerator()

    for _ in range(iterations):
        start_time = time.perf_counter()
        # Only the elapsed time matters here, so the result is discarded.
        generator.generate_markdown(
            cleaned_html=markdown,
            base_url="https://en.wikipedia.org"
        )
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    avg_time = sum(times) / len(times)
    print(f"\n{'='*20} Performance Test {'='*20}")
    print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
    print(f"Min time: {min(times):.4f} seconds")
    print(f"Max time: {max(times):.4f} seconds")
def test_image_links():
    """Check that image syntax is preserved and image titles reach the references.

    The input holds two images (one with a title) and one regular link.
    """
    # NOTE(review): the image markup had been stripped from this fixture,
    # leaving "Here's an  and another ." with nothing for the assertions to
    # match. The two image references below are a reconstruction; the title
    # text is the one the second assertion requires. Confirm against upstream.
    markdown = """
    Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg).
    And a regular [link](/page).
    """

    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
    )

    assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
    assert "Image Title" in result.references_markdown, "Image title should be in references"
if __name__ == "__main__":
    # Script entry point: run every test in this file, in order, without pytest.
    print("Running markdown generation strategy tests...")
    for run_test in (
        test_basic_markdown_conversion,
        test_relative_links,
        test_duplicate_links,
        test_link_descriptions,
        test_performance_large_document,
        test_image_links,
    ):
        run_test()