Spaces:
Configuration error
Configuration error
Commit
·
68bd3e0
1
Parent(s):
0195b9e
Ditched the idea of scraping biographical context from public LinkedIn profiles. Users will export their profile and upload it instead.
Browse files
- .devcontainer/devcontainer.json +1 -1
- .gitignore +1 -2
- functions/__init__.py +0 -10
- functions/context_acquisition.py +2 -209
- packages.txt +0 -1
- requirements.txt +1 -3
- resumate.py +28 -23
- tests/test_context_acquisition.py +0 -249
.devcontainer/devcontainer.json
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
{
|
4 |
"name": "Python 3.10: resumate",
|
5 |
"image": "mcr.microsoft.com/devcontainers/python:0-3.11",
|
6 |
-
"onCreateCommand": "sudo apt update && sudo apt upgrade -y &&
|
7 |
"customizations": {
|
8 |
"vscode": {
|
9 |
"extensions": [
|
|
|
3 |
{
|
4 |
"name": "Python 3.10: resumate",
|
5 |
"image": "mcr.microsoft.com/devcontainers/python:0-3.11",
|
6 |
+
"onCreateCommand": "sudo apt update && sudo apt upgrade -y && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
|
7 |
"customizations": {
|
8 |
"vscode": {
|
9 |
"extensions": [
|
.gitignore
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
__pycache__
|
2 |
.vscode
|
3 |
-
.venv
|
4 |
-
html
|
|
|
1 |
__pycache__
|
2 |
.vscode
|
3 |
+
.venv
|
|
functions/__init__.py
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Functions package for the resumate application.
|
3 |
-
|
4 |
-
This package contains modules for data acquisition, processing, and analysis
|
5 |
-
of LinkedIn profiles, GitHub profiles, and job postings.
|
6 |
-
"""
|
7 |
-
|
8 |
-
from .context_acquisition import get_linkedin_profile_html
|
9 |
-
|
10 |
-
__all__ = ['get_linkedin_profile_html']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/context_acquisition.py
CHANGED
@@ -1,210 +1,3 @@
|
|
1 |
"""
|
2 |
-
|
3 |
-
|
4 |
-
Functions for acquiring context from various sources including LinkedIn profiles,
|
5 |
-
GitHub profiles, and job postings using browser automation.
|
6 |
-
"""
|
7 |
-
|
8 |
-
import time
|
9 |
-
import logging
|
10 |
-
import os
|
11 |
-
from urllib.parse import urlparse
|
12 |
-
|
13 |
-
from selenium import webdriver
|
14 |
-
from selenium.webdriver.chrome.options import Options
|
15 |
-
from selenium.webdriver.common.by import By
|
16 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
17 |
-
from selenium.webdriver.support import expected_conditions as EC
|
18 |
-
from selenium.common.exceptions import TimeoutException, WebDriverException
|
19 |
-
|
20 |
-
# Set up logging
|
21 |
-
logging.basicConfig(level=logging.INFO)
|
22 |
-
logger = logging.getLogger(__name__)
|
23 |
-
|
24 |
-
|
25 |
-
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
26 |
-
"""
|
27 |
-
Retrieve the HTML content of a LinkedIn profile using browser automation.
|
28 |
-
The HTML content is saved to the html directory and also returned.
|
29 |
-
|
30 |
-
Args:
|
31 |
-
profile_url (str): The URL of the LinkedIn profile to scrape
|
32 |
-
wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)
|
33 |
-
|
34 |
-
Returns:
|
35 |
-
str: The HTML content of the LinkedIn profile page
|
36 |
-
|
37 |
-
Raises:
|
38 |
-
ValueError: If the URL is not a valid LinkedIn profile URL
|
39 |
-
WebDriverException: If there's an issue with the browser automation
|
40 |
-
TimeoutException: If the page takes too long to load
|
41 |
-
|
42 |
-
Note:
|
43 |
-
The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
|
44 |
-
"""
|
45 |
-
|
46 |
-
# Validate LinkedIn URL
|
47 |
-
if not profile_url or not isinstance(profile_url, str):
|
48 |
-
raise ValueError("Profile URL must be a non-empty string")
|
49 |
-
|
50 |
-
if "linkedin.com/in/" not in profile_url:
|
51 |
-
raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
|
52 |
-
|
53 |
-
# Configure Chrome options for headless browsing
|
54 |
-
chrome_options = setup_chrome_driver_options()
|
55 |
-
|
56 |
-
driver = None
|
57 |
-
try:
|
58 |
-
# Initialize the Chrome driver
|
59 |
-
logger.info("Initializing browser for URL: %s", profile_url)
|
60 |
-
driver = webdriver.Chrome(options=chrome_options)
|
61 |
-
driver.set_page_load_timeout(30)
|
62 |
-
|
63 |
-
# Navigate to the LinkedIn profile
|
64 |
-
logger.info("Navigating to LinkedIn profile...")
|
65 |
-
driver.get(profile_url)
|
66 |
-
|
67 |
-
# Wait for the page to load
|
68 |
-
# Look for common LinkedIn profile elements
|
69 |
-
wait = WebDriverWait(driver, wait_time)
|
70 |
-
|
71 |
-
try:
|
72 |
-
# Wait for either the main content or login prompt
|
73 |
-
wait.until(
|
74 |
-
EC.any_of(
|
75 |
-
EC.presence_of_element_located(( # Profile header
|
76 |
-
By.CSS_SELECTOR,
|
77 |
-
".pv-top-card"
|
78 |
-
)),
|
79 |
-
EC.presence_of_element_located(( # Profile section
|
80 |
-
By.CSS_SELECTOR,
|
81 |
-
".profile-section"
|
82 |
-
)),
|
83 |
-
EC.presence_of_element_located(( # Auth wall
|
84 |
-
By.CSS_SELECTOR,
|
85 |
-
".authwall"
|
86 |
-
)),
|
87 |
-
EC.presence_of_element_located(( # Public profile
|
88 |
-
By.CSS_SELECTOR,
|
89 |
-
".public-profile"
|
90 |
-
)),
|
91 |
-
)
|
92 |
-
)
|
93 |
-
|
94 |
-
except TimeoutException:
|
95 |
-
logger.warning(
|
96 |
-
"Standard LinkedIn elements not found, proceeding with current page state"
|
97 |
-
)
|
98 |
-
|
99 |
-
# Additional wait to ensure dynamic content loads
|
100 |
-
time.sleep(2)
|
101 |
-
|
102 |
-
# Get the page HTML
|
103 |
-
html_content = driver.page_source
|
104 |
-
|
105 |
-
# Clean up HTML by removing blank lines
|
106 |
-
cleaned_html = _clean_html_content(html_content)
|
107 |
-
|
108 |
-
logger.info(
|
109 |
-
"Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
|
110 |
-
len(html_content),
|
111 |
-
len(cleaned_html)
|
112 |
-
)
|
113 |
-
|
114 |
-
# Save HTML content to file
|
115 |
-
_save_html_to_file(cleaned_html, profile_url)
|
116 |
-
|
117 |
-
return cleaned_html
|
118 |
-
|
119 |
-
except WebDriverException as e:
|
120 |
-
logger.error("WebDriver error occurred: %s", str(e))
|
121 |
-
raise WebDriverException(f"Browser automation failed: {str(e)}") from e
|
122 |
-
|
123 |
-
except Exception as e:
|
124 |
-
logger.error("Unexpected error occurred: %s", str(e))
|
125 |
-
raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
|
126 |
-
|
127 |
-
finally:
|
128 |
-
# Always clean up the driver
|
129 |
-
if driver:
|
130 |
-
try:
|
131 |
-
driver.quit()
|
132 |
-
logger.info("Browser session closed")
|
133 |
-
except WebDriverException as e:
|
134 |
-
logger.warning("Error closing browser: %s", str(e))
|
135 |
-
|
136 |
-
|
137 |
-
def _clean_html_content(html_content: str) -> str:
|
138 |
-
"""
|
139 |
-
Clean HTML content by removing blank lines and excessive whitespace.
|
140 |
-
|
141 |
-
Args:
|
142 |
-
html_content (str): The raw HTML content to clean
|
143 |
-
|
144 |
-
Returns:
|
145 |
-
str: Cleaned HTML content with blank lines removed
|
146 |
-
"""
|
147 |
-
# Split into lines, strip whitespace, and filter out empty lines
|
148 |
-
lines = html_content.split('\n')
|
149 |
-
cleaned_lines = [line.rstrip() for line in lines if line.strip()]
|
150 |
-
|
151 |
-
# Join back together with single newlines
|
152 |
-
return '\n'.join(cleaned_lines)
|
153 |
-
|
154 |
-
|
155 |
-
def _save_html_to_file(html_content: str, profile_url: str) -> str:
|
156 |
-
"""
|
157 |
-
Save HTML content to a file in the html directory.
|
158 |
-
|
159 |
-
Args:
|
160 |
-
html_content (str): The HTML content to save
|
161 |
-
profile_url (str): The original profile URL for filename generation
|
162 |
-
|
163 |
-
Returns:
|
164 |
-
str: The path to the saved file
|
165 |
-
"""
|
166 |
-
try:
|
167 |
-
# Create html directory if it doesn't exist
|
168 |
-
html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
|
169 |
-
os.makedirs(html_dir, exist_ok=True)
|
170 |
-
|
171 |
-
# Generate filename from URL and timestamp
|
172 |
-
parsed_url = urlparse(profile_url)
|
173 |
-
profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
|
174 |
-
filename = f"linkedin_profile_{profile_name}.html"
|
175 |
-
|
176 |
-
# Full file path
|
177 |
-
file_path = os.path.join(html_dir, filename)
|
178 |
-
|
179 |
-
# Save HTML content
|
180 |
-
with open(file_path, 'w', encoding='utf-8') as f:
|
181 |
-
f.write(html_content)
|
182 |
-
|
183 |
-
logger.info("HTML content saved to: %s", file_path)
|
184 |
-
return file_path
|
185 |
-
|
186 |
-
except Exception as e: # pylint: disable=broad-exception-caught
|
187 |
-
logger.warning("Failed to save HTML content: %s", str(e))
|
188 |
-
return ""
|
189 |
-
|
190 |
-
|
191 |
-
def setup_chrome_driver_options() -> Options:
|
192 |
-
"""
|
193 |
-
Create and configure Chrome driver options for web scraping.
|
194 |
-
|
195 |
-
Returns:
|
196 |
-
Options: Configured Chrome options object
|
197 |
-
"""
|
198 |
-
chrome_options = Options()
|
199 |
-
chrome_options.add_argument("--headless") # Run in background
|
200 |
-
chrome_options.add_argument("--no-sandbox")
|
201 |
-
chrome_options.add_argument("--disable-dev-shm-usage")
|
202 |
-
chrome_options.add_argument("--disable-gpu")
|
203 |
-
chrome_options.add_argument("--window-size=1920,1080")
|
204 |
-
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
|
205 |
-
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
206 |
-
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
|
207 |
-
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
208 |
-
chrome_options.add_experimental_option('useAutomationExtension', False)
|
209 |
-
|
210 |
-
return chrome_options
|
|
|
1 |
"""
|
2 |
+
Functions for acquiring context from various sources.
|
3 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
packages.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
chromium-driver
|
|
|
|
requirements.txt
CHANGED
@@ -1,3 +1 @@
|
|
1 |
-
gradio==5.35.0
|
2 |
-
selenium>=4.0.0
|
3 |
-
webdriver-manager>=3.8.0
|
|
|
1 |
+
gradio==5.35.0
|
|
|
|
resumate.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
"""
|
2 |
resumate.py
|
3 |
|
4 |
-
A simple Gradio UI for collecting user profile and job post
|
5 |
|
6 |
-
This app provides
|
7 |
-
- LinkedIn
|
8 |
- GitHub profile URL
|
9 |
- LinkedIn job post URL
|
10 |
|
@@ -15,39 +15,44 @@ To run:
|
|
15 |
"""
|
16 |
|
17 |
import gradio as gr
|
18 |
-
from functions.context_acquisition import get_linkedin_profile_html
|
19 |
|
20 |
|
21 |
-
def process_inputs(
|
22 |
"""
|
23 |
-
Process the input
|
24 |
|
25 |
Args:
|
26 |
-
|
27 |
github_url (str): GitHub profile URL
|
28 |
job_post_url (str): LinkedIn job post URL
|
29 |
|
30 |
Returns:
|
31 |
-
str: Formatted output with
|
32 |
"""
|
33 |
-
result =
|
34 |
-
|
35 |
-
#
|
36 |
-
if
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
|
44 |
return result
|
45 |
|
46 |
with gr.Blocks() as demo:
|
47 |
gr.Markdown("# Resumate: Profile & Job Post Input")
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
51 |
)
|
52 |
|
53 |
github_profile = gr.Textbox(
|
@@ -61,11 +66,11 @@ with gr.Blocks() as demo:
|
|
61 |
)
|
62 |
|
63 |
submit_btn = gr.Button("Submit")
|
64 |
-
output = gr.Textbox(label="Output", lines=
|
65 |
|
66 |
submit_btn.click( # pylint: disable=no-member
|
67 |
process_inputs,
|
68 |
-
inputs=[
|
69 |
outputs=output
|
70 |
)
|
71 |
|
|
|
1 |
"""
|
2 |
resumate.py
|
3 |
|
4 |
+
A simple Gradio UI for collecting user profile and job post information.
|
5 |
|
6 |
+
This app provides inputs for:
|
7 |
+
- LinkedIn resume export PDF file upload
|
8 |
- GitHub profile URL
|
9 |
- LinkedIn job post URL
|
10 |
|
|
|
15 |
"""
|
16 |
|
17 |
import gradio as gr
|
|
|
18 |
|
19 |
|
20 |
+
def process_inputs(linkedin_pdf, github_url, job_post_url):
|
21 |
"""
|
22 |
+
Process the input files and URLs.
|
23 |
|
24 |
Args:
|
25 |
+
linkedin_pdf: Uploaded LinkedIn resume export PDF file
|
26 |
github_url (str): GitHub profile URL
|
27 |
job_post_url (str): LinkedIn job post URL
|
28 |
|
29 |
Returns:
|
30 |
+
str: Formatted output with file and URL information
|
31 |
"""
|
32 |
+
result = ""
|
33 |
+
|
34 |
+
# Process LinkedIn PDF file
|
35 |
+
if linkedin_pdf is not None:
|
36 |
+
result += f"✅ LinkedIn Resume PDF uploaded: {linkedin_pdf.name}\n"
|
37 |
+
result += f" File size: {len(linkedin_pdf.read())} bytes\n\n"
|
38 |
+
# Reset file pointer for potential future use
|
39 |
+
linkedin_pdf.seek(0)
|
40 |
+
else:
|
41 |
+
result += "❌ No LinkedIn resume PDF file uploaded\n\n"
|
42 |
+
|
43 |
+
# Process other inputs
|
44 |
+
result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
|
45 |
+
result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"
|
46 |
|
47 |
return result
|
48 |
|
49 |
with gr.Blocks() as demo:
|
50 |
gr.Markdown("# Resumate: Profile & Job Post Input")
|
51 |
+
|
52 |
+
linkedin_pdf = gr.File(
|
53 |
+
label="LinkedIn Resume Export PDF",
|
54 |
+
file_types=[".pdf"],
|
55 |
+
file_count="single"
|
56 |
)
|
57 |
|
58 |
github_profile = gr.Textbox(
|
|
|
66 |
)
|
67 |
|
68 |
submit_btn = gr.Button("Submit")
|
69 |
+
output = gr.Textbox(label="Output", lines=5)
|
70 |
|
71 |
submit_btn.click( # pylint: disable=no-member
|
72 |
process_inputs,
|
73 |
+
inputs=[linkedin_pdf, github_profile, job_post],
|
74 |
outputs=output
|
75 |
)
|
76 |
|
tests/test_context_acquisition.py
CHANGED
@@ -1,252 +1,3 @@
|
|
1 |
"""
|
2 |
Unit tests for the context_acquisition module.
|
3 |
"""
|
4 |
-
|
5 |
-
import unittest
|
6 |
-
import os
|
7 |
-
import tempfile
|
8 |
-
import shutil
|
9 |
-
from selenium.webdriver.chrome.options import Options
|
10 |
-
|
11 |
-
import functions.context_acquisition
|
12 |
-
|
13 |
-
# Import the functions to test
|
14 |
-
from functions.context_acquisition import (
|
15 |
-
_clean_html_content,
|
16 |
-
_save_html_to_file,
|
17 |
-
setup_chrome_driver_options
|
18 |
-
)
|
19 |
-
|
20 |
-
|
21 |
-
class TestCleanHTMLContent(unittest.TestCase):
|
22 |
-
"""Test cases for the _clean_html_content function."""
|
23 |
-
|
24 |
-
def test_remove_blank_lines(self):
|
25 |
-
"""Test removal of blank lines from HTML content."""
|
26 |
-
html_with_blanks = """<html>
|
27 |
-
|
28 |
-
<head>
|
29 |
-
<title>Test</title>
|
30 |
-
|
31 |
-
</head>
|
32 |
-
|
33 |
-
<body>
|
34 |
-
<div>Content</div>
|
35 |
-
|
36 |
-
</body>
|
37 |
-
</html>"""
|
38 |
-
|
39 |
-
expected = """<html>
|
40 |
-
<head>
|
41 |
-
<title>Test</title>
|
42 |
-
</head>
|
43 |
-
<body>
|
44 |
-
<div>Content</div>
|
45 |
-
</body>
|
46 |
-
</html>"""
|
47 |
-
|
48 |
-
result = _clean_html_content(html_with_blanks)
|
49 |
-
self.assertEqual(result, expected)
|
50 |
-
|
51 |
-
def test_strip_trailing_whitespace(self):
|
52 |
-
"""Test removal of trailing whitespace from lines."""
|
53 |
-
html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
|
54 |
-
expected = "<div>Content</div>\n<p>Text</p>"
|
55 |
-
|
56 |
-
result = _clean_html_content(html_with_trailing)
|
57 |
-
self.assertEqual(result, expected)
|
58 |
-
|
59 |
-
def test_empty_content(self):
|
60 |
-
"""Test handling of empty or whitespace-only content."""
|
61 |
-
self.assertEqual(_clean_html_content(""), "")
|
62 |
-
self.assertEqual(_clean_html_content(" \n\n\t "), "")
|
63 |
-
self.assertEqual(_clean_html_content("\n"), "")
|
64 |
-
|
65 |
-
def test_single_line_content(self):
|
66 |
-
"""Test cleaning of single line content."""
|
67 |
-
single_line = "<html><body>Content</body></html>"
|
68 |
-
result = _clean_html_content(single_line)
|
69 |
-
self.assertEqual(result, single_line)
|
70 |
-
|
71 |
-
def test_mixed_whitespace(self):
|
72 |
-
"""Test handling of mixed whitespace characters."""
|
73 |
-
mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
|
74 |
-
expected = "<div>\n<p>Text</p>\n</div>"
|
75 |
-
result = _clean_html_content(mixed)
|
76 |
-
self.assertEqual(result, expected)
|
77 |
-
|
78 |
-
|
79 |
-
class TestSaveHTMLToFile(unittest.TestCase):
|
80 |
-
"""Test cases for the _save_html_to_file function."""
|
81 |
-
|
82 |
-
def setUp(self):
|
83 |
-
"""Set up test fixtures with temporary directory."""
|
84 |
-
self.test_dir = tempfile.mkdtemp()
|
85 |
-
self.test_html = "<html><body>Test content</body></html>"
|
86 |
-
self.test_url = "https://www.linkedin.com/in/johndoe"
|
87 |
-
|
88 |
-
def tearDown(self):
|
89 |
-
"""Clean up temporary directory."""
|
90 |
-
if os.path.exists(self.test_dir):
|
91 |
-
shutil.rmtree(self.test_dir)
|
92 |
-
|
93 |
-
def test_successful_file_save(self):
|
94 |
-
"""Test successful saving of HTML content to file."""
|
95 |
-
# Temporarily change the file path calculation
|
96 |
-
original_dirname = os.path.dirname
|
97 |
-
|
98 |
-
def mock_dirname(path):
|
99 |
-
if path.endswith('context_acquisition.py'):
|
100 |
-
return self.test_dir
|
101 |
-
return original_dirname(path)
|
102 |
-
|
103 |
-
# Replace os.path.dirname temporarily
|
104 |
-
original_func = functions.context_acquisition.os.path.dirname
|
105 |
-
functions.context_acquisition.os.path.dirname = mock_dirname
|
106 |
-
|
107 |
-
try:
|
108 |
-
result = _save_html_to_file(self.test_html, self.test_url)
|
109 |
-
|
110 |
-
# Verify file was created
|
111 |
-
self.assertTrue(os.path.exists(result))
|
112 |
-
self.assertTrue(result.endswith('.html'))
|
113 |
-
|
114 |
-
# Verify file content
|
115 |
-
with open(result, 'r', encoding='utf-8') as f:
|
116 |
-
content = f.read()
|
117 |
-
self.assertEqual(content, self.test_html)
|
118 |
-
|
119 |
-
finally:
|
120 |
-
# Restore original function
|
121 |
-
functions.context_acquisition.os.path.dirname = original_func
|
122 |
-
|
123 |
-
|
124 |
-
class TestSetupChromeDriverOptions(unittest.TestCase):
|
125 |
-
"""Test cases for the setup_chrome_driver_options function."""
|
126 |
-
|
127 |
-
def test_chrome_options_configuration(self):
|
128 |
-
"""Test that Chrome options are properly configured."""
|
129 |
-
options = setup_chrome_driver_options()
|
130 |
-
|
131 |
-
# Verify that options object is returned
|
132 |
-
self.assertIsNotNone(options)
|
133 |
-
|
134 |
-
# Verify it's the correct type
|
135 |
-
self.assertIsInstance(options, Options)
|
136 |
-
|
137 |
-
def test_chrome_options_arguments(self):
|
138 |
-
"""Test that required Chrome arguments are set."""
|
139 |
-
options = setup_chrome_driver_options()
|
140 |
-
|
141 |
-
# Access the arguments (this is implementation dependent)
|
142 |
-
# Note: This test verifies the function runs without error
|
143 |
-
# Specific argument verification would require accessing private attributes
|
144 |
-
self.assertIsNotNone(options)
|
145 |
-
|
146 |
-
|
147 |
-
class TestURLValidation(unittest.TestCase):
|
148 |
-
"""Test cases for URL validation logic (extracted from main function)."""
|
149 |
-
|
150 |
-
def test_valid_linkedin_urls(self):
|
151 |
-
"""Test validation of valid LinkedIn URLs."""
|
152 |
-
valid_urls = [
|
153 |
-
"https://www.linkedin.com/in/johndoe",
|
154 |
-
"https://linkedin.com/in/jane-smith",
|
155 |
-
"http://www.linkedin.com/in/test123",
|
156 |
-
"https://www.linkedin.com/in/user-name-with-dashes",
|
157 |
-
]
|
158 |
-
|
159 |
-
for url in valid_urls:
|
160 |
-
# Test the validation logic directly
|
161 |
-
self.assertTrue(isinstance(url, str))
|
162 |
-
self.assertTrue(url.strip())
|
163 |
-
self.assertIn("linkedin.com/in/", url)
|
164 |
-
|
165 |
-
def test_invalid_linkedin_urls(self):
|
166 |
-
"""Test validation of invalid LinkedIn URLs."""
|
167 |
-
invalid_urls = [
|
168 |
-
"",
|
169 |
-
None,
|
170 |
-
"https://www.example.com/profile",
|
171 |
-
"https://www.linkedin.com/company/test",
|
172 |
-
"https://github.com/user",
|
173 |
-
"not-a-url",
|
174 |
-
]
|
175 |
-
|
176 |
-
for url in invalid_urls:
|
177 |
-
# Test the validation logic directly
|
178 |
-
if url is None or not isinstance(url, str):
|
179 |
-
self.assertTrue(url is None or not isinstance(url, str))
|
180 |
-
elif not url.strip():
|
181 |
-
self.assertFalse(url.strip())
|
182 |
-
else:
|
183 |
-
self.assertNotIn("linkedin.com/in/", url)
|
184 |
-
|
185 |
-
|
186 |
-
class TestHTMLContentProcessing(unittest.TestCase):
|
187 |
-
"""Test cases for HTML content processing workflows."""
|
188 |
-
|
189 |
-
def test_html_cleaning_workflow(self):
|
190 |
-
"""Test the complete HTML cleaning workflow."""
|
191 |
-
raw_html = """<!DOCTYPE html>
|
192 |
-
<html>
|
193 |
-
|
194 |
-
<head>
|
195 |
-
<title>LinkedIn Profile</title>
|
196 |
-
|
197 |
-
</head>
|
198 |
-
|
199 |
-
<body>
|
200 |
-
<div class="profile">
|
201 |
-
<h1>John Doe</h1>
|
202 |
-
|
203 |
-
<p>Software Engineer</p>
|
204 |
-
</div>
|
205 |
-
|
206 |
-
</body>
|
207 |
-
|
208 |
-
</html>"""
|
209 |
-
|
210 |
-
cleaned = _clean_html_content(raw_html)
|
211 |
-
|
212 |
-
# Verify no empty lines
|
213 |
-
lines = cleaned.split('\n')
|
214 |
-
for line in lines:
|
215 |
-
self.assertTrue(line.strip(), f"Found empty line: '{line}'")
|
216 |
-
|
217 |
-
# Verify content is preserved
|
218 |
-
self.assertIn("John Doe", cleaned)
|
219 |
-
self.assertIn("Software Engineer", cleaned)
|
220 |
-
self.assertIn("LinkedIn Profile", cleaned)
|
221 |
-
|
222 |
-
def test_minimal_html_cleaning(self):
|
223 |
-
"""Test cleaning of minimal HTML content."""
|
224 |
-
minimal_html = "<html><body>Content</body></html>"
|
225 |
-
result = _clean_html_content(minimal_html)
|
226 |
-
self.assertEqual(result, minimal_html)
|
227 |
-
|
228 |
-
def test_complex_whitespace_patterns(self):
|
229 |
-
"""Test cleaning of complex whitespace patterns."""
|
230 |
-
complex_html = """<div>
|
231 |
-
\t\t
|
232 |
-
<span>Text</span>
|
233 |
-
\t
|
234 |
-
|
235 |
-
<p>Paragraph</p>
|
236 |
-
\t
|
237 |
-
</div>"""
|
238 |
-
|
239 |
-
result = _clean_html_content(complex_html)
|
240 |
-
lines = result.split('\n')
|
241 |
-
|
242 |
-
# Should have no empty lines
|
243 |
-
for line in lines:
|
244 |
-
self.assertTrue(line.strip())
|
245 |
-
|
246 |
-
# Should preserve content
|
247 |
-
self.assertIn("Text", result)
|
248 |
-
self.assertIn("Paragraph", result)
|
249 |
-
|
250 |
-
|
251 |
-
if __name__ == '__main__':
|
252 |
-
unittest.main()
|
|
|
1 |
"""
|
2 |
Unit tests for the context_acquisition module.
|
3 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|