gperdrizet commited on
Commit
f1fa456
·
verified ·
1 Parent(s): 5ba8d84

Added LinkedIn profile scraping functions.

Browse files
functions/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions package for the resumate application.
3
+
4
+ This package contains modules for data acquisition, processing, and analysis
5
+ of LinkedIn profiles, GitHub profiles, and job postings.
6
+ """
7
+
8
+ from .data_acquisition import get_linkedin_profile_html
9
+
10
+ __all__ = ['get_linkedin_profile_html']
functions/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (422 Bytes). View file
 
functions/__pycache__/context_acquisition.cpython-310.pyc ADDED
Binary file (4.01 kB). View file
 
functions/__pycache__/data_acquisition.cpython-310.pyc ADDED
Binary file (4 kB). View file
 
functions/context_acquisition.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ context_acquisition.py
3
+
4
+ Functions for acquiring context from various sources including LinkedIn profiles,
5
+ GitHub profiles, and job postings using browser automation.
6
+ """
7
+
8
+ from selenium import webdriver
9
+ from selenium.webdriver.chrome.options import Options
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.support.ui import WebDriverWait
12
+ from selenium.webdriver.support import expected_conditions as EC
13
+ from selenium.common.exceptions import TimeoutException, WebDriverException
14
+ import time
15
+ import logging
16
+
17
# Set up module-level logging.
# NOTE(review): logging.basicConfig at import time configures the *root*
# logger as a side effect — any application importing this module inherits
# INFO-level root logging. That is fine for a standalone app but unusual
# for a library module; confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
20
+
21
+
22
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
    """
    Retrieve the HTML content of a LinkedIn profile using browser automation.

    Launches a headless Chrome instance, navigates to the profile URL, waits
    for known LinkedIn page landmarks (or an auth wall) to appear, and returns
    the resulting page source. The browser is always closed, even on failure.

    Args:
        profile_url (str): The URL of the LinkedIn profile to scrape
        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)

    Returns:
        str: The HTML content of the LinkedIn profile page

    Raises:
        ValueError: If the URL is not a valid LinkedIn profile URL
        WebDriverException: If there's an issue with the browser automation
            (includes page-load timeouts, which Selenium raises as a
            WebDriverException subclass)
        RuntimeError: If any other unexpected error occurs
    """

    # Validate input before paying the cost of launching a browser.
    if not profile_url or not isinstance(profile_url, str):
        raise ValueError("Profile URL must be a non-empty string")

    if "linkedin.com/in/" not in profile_url:
        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")

    # Configure Chrome options for headless browsing. The explicit user agent
    # makes the headless browser look like a regular desktop Chrome session.
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None
    try:
        # Initialize the Chrome driver
        logger.info("Initializing browser for URL: %s", profile_url)
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)

        # Navigate to the LinkedIn profile
        logger.info("Navigating to LinkedIn profile...")
        driver.get(profile_url)

        # Wait for any of the common LinkedIn page landmarks. An auth wall
        # still yields usable HTML, so it counts as "loaded" here.
        wait = WebDriverWait(driver, wait_time)

        try:
            wait.until(
                EC.any_of(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".pv-top-card")),      # Profile header
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".profile-section")),  # Profile section
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".authwall")),         # Auth wall
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".public-profile")),   # Public profile
                )
            )
        except TimeoutException:
            # Not fatal: return whatever state the page reached.
            logger.warning("Standard LinkedIn elements not found, proceeding with current page state")

        # Additional wait to ensure dynamic (client-side rendered) content loads
        time.sleep(2)

        # Get the page HTML
        html_content = driver.page_source

        logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
        return html_content

    except WebDriverException as e:
        # Re-raise with context; 'from e' preserves the original traceback.
        logger.error("WebDriver error occurred: %s", e)
        raise WebDriverException(f"Browser automation failed: {str(e)}") from e

    except Exception as e:
        logger.error("Unexpected error occurred: %s", e)
        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e

    finally:
        # Always clean up the driver, even while an exception is propagating.
        if driver:
            try:
                driver.quit()
                logger.info("Browser session closed")
            except Exception as e:  # pylint: disable=broad-exception-caught
                logger.warning("Error closing browser: %s", e)
108
+
109
+
110
def setup_chrome_driver_options() -> Options:
    """
    Create and configure Chrome driver options for web scraping.

    Runs Chrome headless and suppresses the automation fingerprints
    that sites commonly check for.

    Returns:
        Options: Configured Chrome options object
    """
    opts = Options()

    # Headless, sandbox-free setup suitable for containers and CI hosts.
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
    ):
        opts.add_argument(flag)

    # Hide the "controlled by automated software" markers from the browser.
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)

    return opts
resumate.py CHANGED
@@ -15,14 +15,33 @@ To run:
15
  """
16
 
17
  import gradio as gr
 
18
 
19
 
20
  def process_inputs(linkedin_url, github_url, job_post_url):
21
  """
22
- Placeholder function to process the input URLs.
23
- Replace this docstring and logic with actual implementation as needed.
 
 
 
 
 
 
 
24
  """
25
- return f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}"
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  with gr.Blocks() as demo:
28
  gr.Markdown("# Resumate: Profile & Job Post Input")
 
15
  """
16
 
17
  import gradio as gr
18
+ from functions.context_acquisition import get_linkedin_profile_html
19
 
20
 
21
def process_inputs(linkedin_url, github_url, job_post_url):
    """
    Process the input URLs and retrieve content from LinkedIn profile.

    Args:
        linkedin_url (str): LinkedIn profile URL
        github_url (str): GitHub profile URL
        job_post_url (str): LinkedIn job post URL

    Returns:
        str: Formatted output with URL information and LinkedIn profile status
    """
    # Accumulate output pieces and join once at the end.
    pieces = [f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}\n\n"]

    # Only attempt retrieval when a non-blank LinkedIn URL was supplied.
    if linkedin_url and linkedin_url.strip():
        pieces.append("Attempting to retrieve LinkedIn profile...\n")
        try:
            html = get_linkedin_profile_html(linkedin_url)
        except Exception as err:  # pylint: disable=broad-exception-caught
            pieces.append(f"❌ Failed to retrieve LinkedIn profile: {str(err)}\n")
        else:
            pieces.append(f"✅ Successfully retrieved LinkedIn profile HTML ({len(html)} characters)\n")

    return "".join(pieces)
45
 
46
  with gr.Blocks() as demo:
47
  gr.Markdown("# Resumate: Profile & Job Post Input")