# NOTE: "Spaces: Configuration error" is a hosting-page banner that was captured
# together with this file; it is not part of the module.
"""
Unit tests for the context_acquisition module.
"""
import os
import shutil
import tempfile
import unittest
from unittest import mock

from selenium.webdriver.chrome.options import Options

import functions.context_acquisition
# Import the functions to test
from functions.context_acquisition import (
    _clean_html_content,
    _save_html_to_file,
    setup_chrome_driver_options,
)
class TestCleanHTMLContent(unittest.TestCase):
    """Test cases for the _clean_html_content function."""

    def test_remove_blank_lines(self):
        """Test removal of blank lines from HTML content."""
        content_lines = [
            "<html>",
            "<head>",
            "<title>Test</title>",
            "</head>",
            "<body>",
            "<div>Content</div>",
            "</body>",
            "</html>",
        ]
        expected = "\n".join(content_lines)
        # Build the input with explicit newlines (one empty line between every
        # pair of content lines) so the blank lines cannot be lost to editor
        # or copy/paste whitespace normalisation.
        html_with_blanks = "\n\n".join(content_lines)
        # Guard against a vacuous test: the input must really differ.
        self.assertNotEqual(html_with_blanks, expected)

        result = _clean_html_content(html_with_blanks)

        self.assertEqual(result, expected)

    def test_strip_trailing_whitespace(self):
        """Test removal of trailing whitespace from lines."""
        html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
        expected = "<div>Content</div>\n<p>Text</p>"

        result = _clean_html_content(html_with_trailing)

        self.assertEqual(result, expected)

    def test_empty_content(self):
        """Test handling of empty or whitespace-only content."""
        for blank in ("", " \n\n\t ", "\n"):
            # subTest reports each failing input separately instead of
            # stopping at the first one.
            with self.subTest(content=blank):
                self.assertEqual(_clean_html_content(blank), "")

    def test_single_line_content(self):
        """Test cleaning of single line content."""
        single_line = "<html><body>Content</body></html>"

        result = _clean_html_content(single_line)

        self.assertEqual(result, single_line)

    def test_mixed_whitespace(self):
        """Test handling of mixed whitespace characters."""
        mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
        expected = "<div>\n<p>Text</p>\n</div>"

        result = _clean_html_content(mixed)

        self.assertEqual(result, expected)
class TestSaveHTMLToFile(unittest.TestCase):
    """Test cases for the _save_html_to_file function."""

    def setUp(self):
        """Create a temporary directory and the sample inputs for each test."""
        self.test_dir = tempfile.mkdtemp()
        # Registered right away so the directory is removed even when a later
        # setUp step or the test body raises.
        self.addCleanup(shutil.rmtree, self.test_dir, ignore_errors=True)
        self.test_html = "<html><body>Test content</body></html>"
        self.test_url = "https://www.linkedin.com/in/johndoe"

    def test_successful_file_save(self):
        """Test successful saving of HTML content to file."""
        real_dirname = os.path.dirname

        def fake_dirname(path):
            # Redirect only the lookup made for the module under test; every
            # other path keeps the real behaviour.
            if path.endswith('context_acquisition.py'):
                return self.test_dir
            return real_dirname(path)

        # functions.context_acquisition.os is the shared ``os`` module, so this
        # replaces os.path.dirname process-wide. patch.object keeps that window
        # as small as possible and always restores the original on exit.
        with mock.patch.object(
            functions.context_acquisition.os.path, "dirname", new=fake_dirname
        ):
            result = _save_html_to_file(self.test_html, self.test_url)

        # Verify file was created
        self.assertTrue(os.path.exists(result))
        self.assertTrue(result.endswith('.html'))

        # Verify file content
        with open(result, 'r', encoding='utf-8') as saved:
            content = saved.read()
        self.assertEqual(content, self.test_html)
class TestSetupChromeDriverOptions(unittest.TestCase):
    """Test cases for the setup_chrome_driver_options function."""

    def test_chrome_options_configuration(self):
        """Test that Chrome options are properly configured."""
        options = setup_chrome_driver_options()

        # Verify that options object is returned
        self.assertIsNotNone(options)
        # Verify it's the correct type
        self.assertIsInstance(options, Options)

    def test_chrome_options_arguments(self):
        """Test that the configured Chrome arguments are well-formed."""
        options = setup_chrome_driver_options()
        self.assertIsNotNone(options)

        # Selenium exposes the configured command-line switches through the
        # public ``arguments`` property, so no private attribute is needed.
        arguments = options.arguments
        self.assertIsInstance(arguments, list)
        for argument in arguments:
            with self.subTest(argument=argument):
                self.assertIsInstance(argument, str)
                self.assertTrue(argument)
        # NOTE(review): the exact set of required switches is not visible from
        # this file; add explicit assertIn checks once it is confirmed.
class TestURLValidation(unittest.TestCase):
    """Test cases for URL validation logic (extracted from main function)."""

    @staticmethod
    def _is_valid_linkedin_url(url):
        """Return True when *url* is a non-blank string containing a profile path.

        This combines the three checks the tests previously made inline:
        the value is a ``str``, it is not blank after stripping, and it
        contains ``"linkedin.com/in/"``.

        NOTE(review): this mirrors the validation rather than calling it;
        confirm it still matches the production check.
        """
        return (
            isinstance(url, str)
            and bool(url.strip())
            and "linkedin.com/in/" in url
        )

    def test_valid_linkedin_urls(self):
        """Test validation of valid LinkedIn URLs."""
        valid_urls = [
            "https://www.linkedin.com/in/johndoe",
            "https://linkedin.com/in/jane-smith",
            "http://www.linkedin.com/in/test123",
            "https://www.linkedin.com/in/user-name-with-dashes",
        ]
        for url in valid_urls:
            with self.subTest(url=url):
                self.assertTrue(self._is_valid_linkedin_url(url))

    def test_invalid_linkedin_urls(self):
        """Test validation of invalid LinkedIn URLs."""
        invalid_urls = [
            "",
            None,
            "https://www.example.com/profile",
            "https://www.linkedin.com/company/test",
            "https://github.com/user",
            "not-a-url",
        ]
        for url in invalid_urls:
            # One uniform assertion replaces the branch-per-case checks, which
            # only re-asserted the condition that selected the branch.
            with self.subTest(url=url):
                self.assertFalse(self._is_valid_linkedin_url(url))
class TestHTMLContentProcessing(unittest.TestCase):
    """Test cases for HTML content processing workflows."""

    def test_html_cleaning_workflow(self):
        """Test the complete HTML cleaning workflow."""
        # Explicit "" and whitespace-only entries guarantee the input really
        # contains lines that must be dropped; a triple-quoted literal can
        # silently lose them to whitespace normalisation.
        raw_html = "\n".join([
            "<!DOCTYPE html>",
            "<html>",
            "",
            "<head>",
            "<title>LinkedIn Profile</title>",
            "</head>",
            "   ",
            "<body>",
            "<div class=\"profile\">",
            "",
            "<h1>John Doe</h1>",
            "<p>Software Engineer</p>",
            "\t",
            "</div>",
            "</body>",
            "</html>",
        ])

        cleaned = _clean_html_content(raw_html)

        # Verify no empty lines
        for line in cleaned.split('\n'):
            self.assertTrue(line.strip(), f"Found empty line: '{line}'")

        # Verify content is preserved
        self.assertIn("John Doe", cleaned)
        self.assertIn("Software Engineer", cleaned)
        self.assertIn("LinkedIn Profile", cleaned)

    def test_minimal_html_cleaning(self):
        """Test cleaning of minimal HTML content."""
        minimal_html = "<html><body>Content</body></html>"

        result = _clean_html_content(minimal_html)

        self.assertEqual(result, minimal_html)

    def test_complex_whitespace_patterns(self):
        """Test cleaning of complex whitespace patterns."""
        # Tab-only lines are written as explicit list entries so they survive
        # any reformatting of this file.
        complex_html = "\n".join([
            "<div>",
            "\t\t",
            "<span>Text</span>",
            "\t",
            "<p>Paragraph</p>",
            "\t",
            "</div>",
        ])

        result = _clean_html_content(complex_html)

        # Should have no empty lines
        for line in result.split('\n'):
            self.assertTrue(line.strip())

        # Should preserve content
        self.assertIn("Text", result)
        self.assertIn("Paragraph", result)
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()