Spaces:
Configuration error
Configuration error
Added unittests for context acquisition functions.
Browse files
tests/test_context_acquisition.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Unit tests for the context_acquisition module.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import unittest
|
6 |
+
import os
|
7 |
+
import tempfile
|
8 |
+
import shutil
|
9 |
+
from selenium.webdriver.chrome.options import Options
|
10 |
+
|
11 |
+
import functions.context_acquisition
|
12 |
+
|
13 |
+
# Import the functions to test
|
14 |
+
from functions.context_acquisition import (
|
15 |
+
_clean_html_content,
|
16 |
+
_save_html_to_file,
|
17 |
+
setup_chrome_driver_options
|
18 |
+
)
|
19 |
+
|
20 |
+
|
21 |
+
class TestCleanHTMLContent(unittest.TestCase):
    """Checks for ``_clean_html_content`` on blank lines and trailing whitespace."""

    def test_remove_blank_lines(self):
        """Expect empty lines between tags to be dropped and all other lines kept."""
        html_with_blanks = "\n".join([
            "<html>",
            "",
            "<head>",
            "<title>Test</title>",
            "",
            "</head>",
            "",
            "<body>",
            "<div>Content</div>",
            "",
            "</body>",
            "</html>",
        ])
        expected = "\n".join([
            "<html>",
            "<head>",
            "<title>Test</title>",
            "</head>",
            "<body>",
            "<div>Content</div>",
            "</body>",
            "</html>",
        ])

        self.assertEqual(_clean_html_content(html_with_blanks), expected)

    def test_strip_trailing_whitespace(self):
        """Expect trailing spaces/tabs and the final newline to be removed."""
        dirty = "<div>Content</div> \n<p>Text</p>\t\n"
        cleaned = _clean_html_content(dirty)

        self.assertEqual(cleaned, "<div>Content</div>\n<p>Text</p>")

    def test_empty_content(self):
        """Expect empty and whitespace-only input to clean to the empty string."""
        for blank in ("", " \n\n\t ", "\n"):
            self.assertEqual(_clean_html_content(blank), "")

    def test_single_line_content(self):
        """Expect a single line without stray whitespace to come back unchanged."""
        one_liner = "<html><body>Content</body></html>"

        self.assertEqual(_clean_html_content(one_liner), one_liner)

    def test_mixed_whitespace(self):
        """Expect lines made only of tabs, spaces or CRLF to be removed."""
        messy = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
        cleaned = _clean_html_content(messy)

        self.assertEqual(cleaned, "<div>\n<p>Text</p>\n</div>")
|
77 |
+
|
78 |
+
|
79 |
+
class TestSaveHTMLToFile(unittest.TestCase):
    """Test cases for the _save_html_to_file function."""

    def setUp(self):
        """Create a scratch directory plus the HTML and URL fixtures."""
        self.test_dir = tempfile.mkdtemp()
        self.test_html = "<html><body>Test content</body></html>"
        self.test_url = "https://www.linkedin.com/in/johndoe"

    def tearDown(self):
        """Remove the scratch directory if it still exists."""
        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir)

    def test_successful_file_save(self):
        """Saving returns a path to an existing ``.html`` file holding the input.

        ``dirname`` is patched so that a path ending in
        ``context_acquisition.py`` resolves to the scratch directory.
        """
        # Imported locally so the test file's import block does not need to change.
        from unittest import mock

        # Capture the real function before patching; the fake delegates to it.
        real_dirname = os.path.dirname

        def fake_dirname(path):
            # presumably _save_html_to_file derives its output location from
            # dirname(__file__) -- verify against the implementation.
            if path.endswith('context_acquisition.py'):
                return self.test_dir
            return real_dirname(path)

        # patch.object restores the attribute on exit, even if an assertion
        # fails, replacing the manual save/assign/restore sequence.
        # NOTE(review): if the module under test imports the standard ``os``,
        # this swaps os.path.dirname process-wide while the block is active.
        with mock.patch.object(
            functions.context_acquisition.os.path, "dirname", new=fake_dirname
        ):
            result = _save_html_to_file(self.test_html, self.test_url)

            # Verify file was created
            self.assertTrue(os.path.exists(result))
            self.assertTrue(result.endswith('.html'))

            # Verify file content
            with open(result, 'r', encoding='utf-8') as f:
                content = f.read()
            self.assertEqual(content, self.test_html)
|
122 |
+
|
123 |
+
|
124 |
+
class TestSetupChromeDriverOptions(unittest.TestCase):
    """Smoke tests for ``setup_chrome_driver_options``."""

    def test_chrome_options_configuration(self):
        """The factory returns a non-None selenium ``Options`` instance."""
        chrome_options = setup_chrome_driver_options()

        # A value must come back at all ...
        self.assertIsNotNone(chrome_options)

        # ... and it must be the Chrome Options type.
        self.assertIsInstance(chrome_options, Options)

    def test_chrome_options_arguments(self):
        """Calling the factory succeeds and yields a value.

        Only a non-None result is asserted here; no individual argument
        is inspected.
        """
        # NOTE(review): selenium's Options appears to expose the configured
        # flags through a public ``arguments`` attribute -- confirm, then
        # consider asserting on the flags this project actually requires.
        self.assertIsNotNone(setup_chrome_driver_options())
|
145 |
+
|
146 |
+
|
147 |
+
class TestURLValidation(unittest.TestCase):
    """Test cases for URL validation logic (extracted from main function).

    The rule is restated locally in ``_is_linkedin_profile_url``; these
    tests do not call anything from the module under test.
    """

    @staticmethod
    def _is_linkedin_profile_url(url):
        """Return True if *url* is a non-blank str containing ``linkedin.com/in/``."""
        if not isinstance(url, str):
            return False
        if not url.strip():
            return False
        return "linkedin.com/in/" in url

    def test_valid_linkedin_urls(self):
        """Every sample profile URL satisfies the validation rule."""
        valid_urls = [
            "https://www.linkedin.com/in/johndoe",
            "https://linkedin.com/in/jane-smith",
            "http://www.linkedin.com/in/test123",
            "https://www.linkedin.com/in/user-name-with-dashes",
        ]

        for url in valid_urls:
            # subTest makes a failure report name the offending URL.
            with self.subTest(url=url):
                self.assertTrue(self._is_linkedin_profile_url(url))

    def test_invalid_linkedin_urls(self):
        """Empty, None, non-LinkedIn and non-profile URLs fail the rule."""
        invalid_urls = [
            "",
            None,
            "https://www.example.com/profile",
            "https://www.linkedin.com/company/test",
            "https://github.com/user",
            "not-a-url",
        ]

        for url in invalid_urls:
            # One assertion on the predicate replaces the per-branch checks,
            # which re-asserted the condition that selected the branch.
            with self.subTest(url=url):
                self.assertFalse(self._is_linkedin_profile_url(url))
|
184 |
+
|
185 |
+
|
186 |
+
class TestHTMLContentProcessing(unittest.TestCase):
    """Workflow-level checks built on ``_clean_html_content``."""

    def test_html_cleaning_workflow(self):
        """A full page loses its blank lines but keeps its visible text."""
        raw_html = "\n".join([
            "<!DOCTYPE html>",
            "<html>",
            "",
            "<head>",
            "<title>LinkedIn Profile</title>",
            "",
            "</head>",
            "",
            "<body>",
            '<div class="profile">',
            "<h1>John Doe</h1>",
            "",
            "<p>Software Engineer</p>",
            "</div>",
            "",
            "</body>",
            "",
            "</html>",
        ])

        cleaned = _clean_html_content(raw_html)

        # Every remaining line must carry non-whitespace content.
        for line in cleaned.split('\n'):
            self.assertTrue(line.strip(), f"Found empty line: '{line}'")

        # The text of the page must survive cleaning.
        for fragment in ("John Doe", "Software Engineer", "LinkedIn Profile"):
            self.assertIn(fragment, cleaned)

    def test_minimal_html_cleaning(self):
        """A one-line document comes back unchanged."""
        minimal_html = "<html><body>Content</body></html>"

        self.assertEqual(_clean_html_content(minimal_html), minimal_html)

    def test_complex_whitespace_patterns(self):
        """Tab-only and empty lines are removed while the text is kept."""
        complex_html = "\n".join([
            "<div>",
            "\t\t",
            "<span>Text</span>",
            "\t",
            "",
            "<p>Paragraph</p>",
            "\t",
            "</div>",
        ])

        result = _clean_html_content(complex_html)

        # No line in the output may be blank.
        for piece in result.split('\n'):
            self.assertTrue(piece.strip())

        # Content must be preserved.
        self.assertIn("Text", result)
        self.assertIn("Paragraph", result)
|
249 |
+
|
250 |
+
|
251 |
+
# Run the suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|