Spaces:
Configuration error
Configuration error
Added a function to save LinkedIn profile HTML.
Browse files- .gitignore +2 -1
- functions/context_acquisition.py +52 -1
.gitignore
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
__pycache__
|
| 2 |
.vscode
|
| 3 |
-
.venv
|
|
|
|
|
|
| 1 |
__pycache__
|
| 2 |
.vscode
|
| 3 |
+
.venv
|
| 4 |
+
html
|
functions/context_acquisition.py
CHANGED
|
@@ -7,6 +7,9 @@ GitHub profiles, and job postings using browser automation.
|
|
| 7 |
|
| 8 |
import time
|
| 9 |
import logging
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from selenium import webdriver
|
| 12 |
from selenium.webdriver.chrome.options import Options
|
|
@@ -23,6 +26,7 @@ logger = logging.getLogger(__name__)
|
|
| 23 |
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
| 24 |
"""
|
| 25 |
Retrieve the HTML content of a LinkedIn profile using browser automation.
|
|
|
|
| 26 |
|
| 27 |
Args:
|
| 28 |
profile_url (str): The URL of the LinkedIn profile to scrape
|
|
@@ -35,6 +39,9 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 35 |
ValueError: If the URL is not a valid LinkedIn profile URL
|
| 36 |
WebDriverException: If there's an issue with the browser automation
|
| 37 |
TimeoutException: If the page takes too long to load
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
# Validate LinkedIn URL
|
|
@@ -91,8 +98,11 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 91 |
)),
|
| 92 |
)
|
| 93 |
)
|
|
|
|
| 94 |
except TimeoutException:
|
| 95 |
-
logger.warning(
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Additional wait to ensure dynamic content loads
|
| 98 |
time.sleep(2)
|
|
@@ -101,6 +111,10 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 101 |
html_content = driver.page_source
|
| 102 |
|
| 103 |
logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return html_content
|
| 105 |
|
| 106 |
except WebDriverException as e:
|
|
@@ -121,6 +135,43 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 121 |
logger.warning("Error closing browser: %s", str(e))
|
| 122 |
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
def setup_chrome_driver_options() -> Options:
|
| 125 |
"""
|
| 126 |
Create and configure Chrome driver options for web scraping.
|
|
|
|
| 7 |
|
| 8 |
import time
|
| 9 |
import logging
|
| 10 |
+
import os
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from urllib.parse import urlparse
|
| 13 |
|
| 14 |
from selenium import webdriver
|
| 15 |
from selenium.webdriver.chrome.options import Options
|
|
|
|
| 26 |
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
| 27 |
"""
|
| 28 |
Retrieve the HTML content of a LinkedIn profile using browser automation.
|
| 29 |
+
The HTML content is saved to the html directory and also returned.
|
| 30 |
|
| 31 |
Args:
|
| 32 |
profile_url (str): The URL of the LinkedIn profile to scrape
|
|
|
|
| 39 |
ValueError: If the URL is not a valid LinkedIn profile URL
|
| 40 |
WebDriverException: If there's an issue with the browser automation
|
| 41 |
TimeoutException: If the page takes too long to load
|
| 42 |
+
|
| 43 |
+
Note:
|
| 44 |
+
The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
|
| 45 |
"""
|
| 46 |
|
| 47 |
# Validate LinkedIn URL
|
|
|
|
| 98 |
)),
|
| 99 |
)
|
| 100 |
)
|
| 101 |
+
|
| 102 |
except TimeoutException:
|
| 103 |
+
logger.warning(
|
| 104 |
+
"Standard LinkedIn elements not found, proceeding with current page state"
|
| 105 |
+
)
|
| 106 |
|
| 107 |
# Additional wait to ensure dynamic content loads
|
| 108 |
time.sleep(2)
|
|
|
|
| 111 |
html_content = driver.page_source
|
| 112 |
|
| 113 |
logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
|
| 114 |
+
|
| 115 |
+
# Save HTML content to file
|
| 116 |
+
_save_html_to_file(html_content, profile_url)
|
| 117 |
+
|
| 118 |
return html_content
|
| 119 |
|
| 120 |
except WebDriverException as e:
|
|
|
|
| 135 |
logger.warning("Error closing browser: %s", str(e))
|
| 136 |
|
| 137 |
|
| 138 |
+
def _save_html_to_file(html_content: str, profile_url: str) -> str:
    """
    Save HTML content to a file in the html directory.

    Args:
        html_content (str): The HTML content to save
        profile_url (str): The original profile URL for filename generation

    Returns:
        str: The path to the saved file, or "" if saving failed
    """
    try:
        # Create html directory if it doesn't exist.
        # NOTE(review): assumes this module lives one level below the project
        # root (functions/...), so the html dir is a sibling of that folder.
        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
        os.makedirs(html_dir, exist_ok=True)

        # Generate filename from URL and timestamp. Strip trailing slashes
        # first so URLs like ".../in/john-doe/" still yield "john-doe"
        # instead of an empty segment (which would mislabel every such
        # profile as "unknown_profile").
        parsed_url = urlparse(profile_url)
        profile_name = parsed_url.path.rstrip('/').split('/')[-1] or 'unknown_profile'
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"linkedin_profile_{profile_name}_{timestamp}.html"

        # Full file path
        file_path = os.path.join(html_dir, filename)

        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        logger.info("HTML content saved to: %s", file_path)
        return file_path

    except Exception as e:  # pylint: disable=broad-exception-caught
        # Saving is deliberately best-effort: a filesystem error must not
        # abort the scraping flow, so we log and return an empty path.
        logger.warning("Failed to save HTML content: %s", str(e))
        return ""
|
| 173 |
+
|
| 174 |
+
|
| 175 |
def setup_chrome_driver_options() -> Options:
|
| 176 |
"""
|
| 177 |
Create and configure Chrome driver options for web scraping.
|