Spaces:
Configuration error
Configuration error
Fixed LinkedIn HTML file name.
Browse files
functions/context_acquisition.py
CHANGED
@@ -8,7 +8,6 @@ GitHub profiles, and job postings using browser automation.
|
|
8 |
import time
|
9 |
import logging
|
10 |
import os
|
11 |
-
from datetime import datetime
|
12 |
from urllib.parse import urlparse
|
13 |
|
14 |
from selenium import webdriver
|
@@ -110,12 +109,16 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
110 |
# Get the page HTML
|
111 |
html_content = driver.page_source
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
114 |
|
115 |
# Save HTML content to file
|
116 |
-
_save_html_to_file(
|
117 |
|
118 |
-
return
|
119 |
|
120 |
except WebDriverException as e:
|
121 |
logger.error("WebDriver error occurred: %s", str(e))
|
@@ -135,6 +138,24 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
135 |
logger.warning("Error closing browser: %s", str(e))
|
136 |
|
137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
def _save_html_to_file(html_content: str, profile_url: str) -> str:
|
139 |
"""
|
140 |
Save HTML content to a file in the html directory.
|
@@ -153,9 +174,10 @@ def _save_html_to_file(html_content: str, profile_url: str) -> str:
|
|
153 |
|
154 |
# Generate filename from URL and timestamp
|
155 |
parsed_url = urlparse(profile_url)
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
159 |
|
160 |
# Full file path
|
161 |
file_path = os.path.join(html_dir, filename)
|
|
|
8 |
import time
|
9 |
import logging
|
10 |
import os
|
|
|
11 |
from urllib.parse import urlparse
|
12 |
|
13 |
from selenium import webdriver
|
|
|
109 |
# Get the page HTML
|
110 |
html_content = driver.page_source
|
111 |
|
112 |
+
# Clean up HTML by removing blank lines
|
113 |
+
cleaned_html = _clean_html_content(html_content)
|
114 |
+
|
115 |
+
logger.info("Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
|
116 |
+
len(html_content), len(cleaned_html))
|
117 |
|
118 |
# Save HTML content to file
|
119 |
+
_save_html_to_file(cleaned_html, profile_url)
|
120 |
|
121 |
+
return cleaned_html
|
122 |
|
123 |
except WebDriverException as e:
|
124 |
logger.error("WebDriver error occurred: %s", str(e))
|
|
|
138 |
logger.warning("Error closing browser: %s", str(e))
|
139 |
|
140 |
|
141 |
+
def _clean_html_content(html_content: str) -> str:
|
142 |
+
"""
|
143 |
+
Clean HTML content by removing blank lines and excessive whitespace.
|
144 |
+
|
145 |
+
Args:
|
146 |
+
html_content (str): The raw HTML content to clean
|
147 |
+
|
148 |
+
Returns:
|
149 |
+
str: Cleaned HTML content with blank lines removed
|
150 |
+
"""
|
151 |
+
# Split into lines, strip whitespace, and filter out empty lines
|
152 |
+
lines = html_content.split('\n')
|
153 |
+
cleaned_lines = [line.rstrip() for line in lines if line.strip()]
|
154 |
+
|
155 |
+
# Join back together with single newlines
|
156 |
+
return '\n'.join(cleaned_lines)
|
157 |
+
|
158 |
+
|
159 |
def _save_html_to_file(html_content: str, profile_url: str) -> str:
|
160 |
"""
|
161 |
Save HTML content to a file in the html directory.
|
|
|
174 |
|
175 |
# Generate filename from the profile name in the URL path
|
176 |
parsed_url = urlparse(profile_url)
|
177 |
+
print(parsed_url)
|
178 |
+
profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
|
179 |
+
print(profile_name)
|
180 |
+
filename = f"linkedin_profile_{profile_name}.html"
|
181 |
|
182 |
# Full file path
|
183 |
file_path = os.path.join(html_dir, filename)
|