Spaces:
Configuration error
Configuration error
| """ | |
| context_acquisition.py | |
| Functions for acquiring context from various sources including LinkedIn profiles, | |
| GitHub profiles, and job postings using browser automation. | |
| """ | |
| import time | |
| import logging | |
| import os | |
| from urllib.parse import urlparse | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.common.exceptions import TimeoutException, WebDriverException | |
# Set up logging
# NOTE(review): basicConfig at import time mutates the process-wide root
# logger configuration; fine for a script, but confirm this module is not
# imported as a library by an app that configures logging itself.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
    """
    Retrieve the HTML content of a LinkedIn profile using browser automation.

    The HTML content is saved to the html directory and also returned.

    Args:
        profile_url (str): The URL of the LinkedIn profile to scrape
        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)

    Returns:
        str: The HTML content of the LinkedIn profile page

    Raises:
        ValueError: If the URL is not a valid LinkedIn profile URL
        WebDriverException: If there's an issue with the browser automation
        TimeoutException: If the page takes too long to load

    Note:
        The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
    """
    # Validate LinkedIn URL before spinning up a browser (cheap fail-fast)
    if not profile_url or not isinstance(profile_url, str):
        raise ValueError("Profile URL must be a non-empty string")
    if "linkedin.com/in/" not in profile_url:
        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
    # Configure Chrome options for headless browsing
    chrome_options = setup_chrome_driver_options()
    driver = None  # kept outside try so finally can always reference it
    try:
        # Initialize the Chrome driver
        logger.info("Initializing browser for URL: %s", profile_url)
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)
        # Navigate to the LinkedIn profile
        logger.info("Navigating to LinkedIn profile...")
        driver.get(profile_url)
        # Wait for the page to load.
        # LinkedIn serves several page variants (logged-out auth wall, public
        # profile, full profile), so accept whichever marker appears first.
        wait = WebDriverWait(driver, wait_time)
        try:
            # Wait for either the main content or login prompt
            wait.until(
                EC.any_of(
                    EC.presence_of_element_located((  # Profile header
                        By.CSS_SELECTOR,
                        ".pv-top-card"
                    )),
                    EC.presence_of_element_located((  # Profile section
                        By.CSS_SELECTOR,
                        ".profile-section"
                    )),
                    EC.presence_of_element_located((  # Auth wall
                        By.CSS_SELECTOR,
                        ".authwall"
                    )),
                    EC.presence_of_element_located((  # Public profile
                        By.CSS_SELECTOR,
                        ".public-profile"
                    )),
                )
            )
        except TimeoutException:
            # Deliberately non-fatal: grab whatever the page currently shows
            logger.warning(
                "Standard LinkedIn elements not found, proceeding with current page state"
            )
        # Additional wait to ensure dynamic (JS-rendered) content loads
        time.sleep(2)
        # Get the page HTML
        html_content = driver.page_source
        # Clean up HTML by removing blank lines
        cleaned_html = _clean_html_content(html_content)
        logger.info(
            "Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
            len(html_content),
            len(cleaned_html)
        )
        # Save HTML content to file (best-effort; failures are logged, not raised)
        _save_html_to_file(cleaned_html, profile_url)
        return cleaned_html
    except WebDriverException as e:
        logger.error("WebDriver error occurred: %s", str(e))
        raise WebDriverException(f"Browser automation failed: {str(e)}") from e
    except Exception as e:
        # Anything unexpected is wrapped so callers get one consistent type
        logger.error("Unexpected error occurred: %s", str(e))
        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
    finally:
        # Always clean up the driver, even on failure paths
        if driver:
            try:
                driver.quit()
                logger.info("Browser session closed")
            except WebDriverException as e:
                # quit() can itself fail if the browser already died; log only
                logger.warning("Error closing browser: %s", str(e))
| def _clean_html_content(html_content: str) -> str: | |
| """ | |
| Clean HTML content by removing blank lines and excessive whitespace. | |
| Args: | |
| html_content (str): The raw HTML content to clean | |
| Returns: | |
| str: Cleaned HTML content with blank lines removed | |
| """ | |
| # Split into lines, strip whitespace, and filter out empty lines | |
| lines = html_content.split('\n') | |
| cleaned_lines = [line.rstrip() for line in lines if line.strip()] | |
| # Join back together with single newlines | |
| return '\n'.join(cleaned_lines) | |
def _save_html_to_file(html_content: str, profile_url: str) -> str:
    """
    Save HTML content to a timestamped file in the html directory.

    Args:
        html_content (str): The HTML content to save
        profile_url (str): The original profile URL for filename generation

    Returns:
        str: The path to the saved file, or an empty string if saving failed
    """
    try:
        # Create html directory (sibling of this module's parent) if it doesn't exist
        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
        os.makedirs(html_dir, exist_ok=True)
        # Extract the profile slug from the URL path, e.g. /in/<name>/...
        parsed_url = urlparse(profile_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        # path_parts is expected to look like ['in', '<name>', ...];
        # guard the index so a malformed path can't raise IndexError
        profile_name = path_parts[1] if len(path_parts) > 1 else 'unknown_profile'
        # Timestamp keeps successive scrapes from overwriting each other and
        # matches the documented linkedin_profile_<name>_<timestamp>.html format
        timestamp = time.strftime('%Y%m%d_%H%M%S')
        filename = f"linkedin_profile_{profile_name}_{timestamp}.html"
        # Full file path
        file_path = os.path.join(html_dir, filename)
        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        logger.info("HTML content saved to: %s", file_path)
        return file_path
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Saving is best-effort: log and return "" rather than abort the scrape
        logger.warning("Failed to save HTML content: %s", str(e))
        return ""
def setup_chrome_driver_options() -> Options:
    """
    Build a Chrome Options object configured for headless scraping.

    Returns:
        Options: Chrome options with headless mode, a fixed desktop
        viewport/user agent, and automation-detection countermeasures.
    """
    options = Options()
    # Headless/stability flags plus a fixed desktop viewport and user agent
    flags = (
        "--headless",  # Run in background
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "--disable-blink-features=AutomationControlled",
    )
    for flag in flags:
        options.add_argument(flag)
    # Suppress the "controlled by automated software" hints Chrome exposes
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    return options