""" | |
context_acquisition.py | |
Functions for acquiring context from various sources including LinkedIn profiles, | |
GitHub profiles, and job postings using browser automation. | |
""" | |
import time | |
import logging | |
import os | |
from urllib.parse import urlparse | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.common.exceptions import TimeoutException, WebDriverException | |
# Set up logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |


def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
    """
    Retrieve the HTML content of a LinkedIn profile using browser automation.

    The HTML content is saved to the html directory and also returned.

    Args:
        profile_url (str): The URL of the LinkedIn profile to scrape
        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)

    Returns:
        str: The HTML content of the LinkedIn profile page

    Raises:
        ValueError: If the URL is not a valid LinkedIn profile URL
        WebDriverException: If there's an issue with the browser automation
        TimeoutException: If the page takes too long to load

    Note:
        The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
    """
    # Validate LinkedIn URL
    if not profile_url or not isinstance(profile_url, str):
        raise ValueError("Profile URL must be a non-empty string")
    if "linkedin.com/in/" not in profile_url:
        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")

    # Configure Chrome options for headless browsing
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    driver = None
    try:
        # Initialize the Chrome driver
        logger.info("Initializing browser for URL: %s", profile_url)
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)

        # Navigate to the LinkedIn profile
        logger.info("Navigating to LinkedIn profile...")
        driver.get(profile_url)

        # Wait for the page to load
        # Look for common LinkedIn profile elements
        wait = WebDriverWait(driver, wait_time)
        try:
            # Wait for either the main content or login prompt
            wait.until(
                EC.any_of(
                    EC.presence_of_element_located((  # Profile header
                        By.CSS_SELECTOR,
                        ".pv-top-card"
                    )),
                    EC.presence_of_element_located((  # Profile section
                        By.CSS_SELECTOR,
                        ".profile-section"
                    )),
                    EC.presence_of_element_located((  # Auth wall
                        By.CSS_SELECTOR,
                        ".authwall"
                    )),
                    EC.presence_of_element_located((  # Public profile
                        By.CSS_SELECTOR,
                        ".public-profile"
                    )),
                )
            )
        except TimeoutException:
            logger.warning(
                "Standard LinkedIn elements not found, proceeding with current page state"
            )

        # Additional wait to ensure dynamic content loads
        time.sleep(2)

        # Get the page HTML
        html_content = driver.page_source

        # Clean up HTML by removing blank lines
        cleaned_html = _clean_html_content(html_content)
        logger.info("Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
                    len(html_content), len(cleaned_html))

        # Save HTML content to file
        _save_html_to_file(cleaned_html, profile_url)

        return cleaned_html
    except WebDriverException as e:
        logger.error("WebDriver error occurred: %s", str(e))
        raise WebDriverException(f"Browser automation failed: {str(e)}") from e
    except Exception as e:
        logger.error("Unexpected error occurred: %s", str(e))
        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
    finally:
        # Always clean up the driver
        if driver:
            try:
                driver.quit()
                logger.info("Browser session closed")
            except WebDriverException as e:
                logger.warning("Error closing browser: %s", str(e))


def _clean_html_content(html_content: str) -> str:
    """
    Clean HTML content by removing blank lines and excessive whitespace.

    Args:
        html_content (str): The raw HTML content to clean

    Returns:
        str: Cleaned HTML content with blank lines removed
    """
    # Split into lines, strip trailing whitespace, and filter out empty lines
    lines = html_content.split('\n')
    cleaned_lines = [line.rstrip() for line in lines if line.strip()]

    # Join back together with single newlines
    return '\n'.join(cleaned_lines)


def _save_html_to_file(html_content: str, profile_url: str) -> str:
    """
    Save HTML content to a file in the html directory.

    Args:
        html_content (str): The HTML content to save
        profile_url (str): The original profile URL for filename generation

    Returns:
        str: The path to the saved file, or an empty string if saving failed
    """
    try:
        # Create html directory if it doesn't exist
        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
        os.makedirs(html_dir, exist_ok=True)

        # Generate filename from the profile slug in the URL and a timestamp
        parsed_url = urlparse(profile_url)
        path_parts = parsed_url.path.split('/')
        profile_name = path_parts[2] if len(path_parts) > 2 and path_parts[2] else 'unknown_profile'
        timestamp = time.strftime('%Y%m%d_%H%M%S')
        filename = f"linkedin_profile_{profile_name}_{timestamp}.html"

        # Full file path
        file_path = os.path.join(html_dir, filename)

        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        logger.info("HTML content saved to: %s", file_path)
        return file_path
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.warning("Failed to save HTML content: %s", str(e))
        return ""


def setup_chrome_driver_options() -> Options:
    """
    Create and configure Chrome driver options for web scraping.

    Returns:
        Options: Configured Chrome options object
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    return chrome_options
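

# A minimal, hedged usage sketch, not part of the library surface: it assumes Chrome
# and a matching chromedriver are installed locally, and it fetches a neutral
# placeholder page rather than a real LinkedIn profile. setup_chrome_driver_options()
# is not called by get_linkedin_profile_html above; this shows how the shared options
# could back other scrapers (e.g. the GitHub profiles and job postings mentioned in
# the module docstring).
if __name__ == "__main__":
    demo_options = setup_chrome_driver_options()
    demo_driver = webdriver.Chrome(options=demo_options)
    try:
        demo_driver.get("https://www.example.com")  # placeholder target, not a real profile
        logger.info("Demo fetch returned %d characters", len(demo_driver.page_source))
    finally:
        demo_driver.quit()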