Spaces:
Configuration error
Configuration error
Added a function to save LinkedIn profile HTML.
Browse files- .gitignore +2 -1
- functions/context_acquisition.py +52 -1
.gitignore
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
__pycache__
|
| 2 |
.vscode
|
| 3 |
-
.venv
|
|
|
|
|
|
| 1 |
__pycache__
|
| 2 |
.vscode
|
| 3 |
+
.venv
|
| 4 |
+
html
|
functions/context_acquisition.py
CHANGED
|
@@ -7,6 +7,9 @@ GitHub profiles, and job postings using browser automation.
|
|
| 7 |
|
| 8 |
import time
|
| 9 |
import logging
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from selenium import webdriver
|
| 12 |
from selenium.webdriver.chrome.options import Options
|
|
@@ -23,6 +26,7 @@ logger = logging.getLogger(__name__)
|
|
| 23 |
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
| 24 |
"""
|
| 25 |
Retrieve the HTML content of a LinkedIn profile using browser automation.
|
|
|
|
| 26 |
|
| 27 |
Args:
|
| 28 |
profile_url (str): The URL of the LinkedIn profile to scrape
|
|
@@ -35,6 +39,9 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 35 |
ValueError: If the URL is not a valid LinkedIn profile URL
|
| 36 |
WebDriverException: If there's an issue with the browser automation
|
| 37 |
TimeoutException: If the page takes too long to load
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
# Validate LinkedIn URL
|
|
@@ -91,8 +98,11 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 91 |
)),
|
| 92 |
)
|
| 93 |
)
|
|
|
|
| 94 |
except TimeoutException:
|
| 95 |
-
logger.warning(
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Additional wait to ensure dynamic content loads
|
| 98 |
time.sleep(2)
|
|
@@ -101,6 +111,10 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 101 |
html_content = driver.page_source
|
| 102 |
|
| 103 |
logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return html_content
|
| 105 |
|
| 106 |
except WebDriverException as e:
|
|
@@ -121,6 +135,43 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
|
| 121 |
logger.warning("Error closing browser: %s", str(e))
|
| 122 |
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
def setup_chrome_driver_options() -> Options:
|
| 125 |
"""
|
| 126 |
Create and configure Chrome driver options for web scraping.
|
|
|
|
| 7 |
|
| 8 |
import time
|
| 9 |
import logging
|
| 10 |
+
import os
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from urllib.parse import urlparse
|
| 13 |
|
| 14 |
from selenium import webdriver
|
| 15 |
from selenium.webdriver.chrome.options import Options
|
|
|
|
| 26 |
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
|
| 27 |
"""
|
| 28 |
Retrieve the HTML content of a LinkedIn profile using browser automation.
|
| 29 |
+
The HTML content is saved to the html directory and also returned.
|
| 30 |
|
| 31 |
Args:
|
| 32 |
profile_url (str): The URL of the LinkedIn profile to scrape
|
|
|
|
| 39 |
ValueError: If the URL is not a valid LinkedIn profile URL
|
| 40 |
WebDriverException: If there's an issue with the browser automation
|
| 41 |
TimeoutException: If the page takes too long to load
|
| 42 |
+
|
| 43 |
+
Note:
|
| 44 |
+
The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
|
| 45 |
"""
|
| 46 |
|
| 47 |
# Validate LinkedIn URL
|
|
|
|
| 98 |
)),
|
| 99 |
)
|
| 100 |
)
|
| 101 |
+
|
| 102 |
except TimeoutException:
|
| 103 |
+
logger.warning(
|
| 104 |
+
"Standard LinkedIn elements not found, proceeding with current page state"
|
| 105 |
+
)
|
| 106 |
|
| 107 |
# Additional wait to ensure dynamic content loads
|
| 108 |
time.sleep(2)
|
|
|
|
| 111 |
html_content = driver.page_source
|
| 112 |
|
| 113 |
logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
|
| 114 |
+
|
| 115 |
+
# Save HTML content to file
|
| 116 |
+
_save_html_to_file(html_content, profile_url)
|
| 117 |
+
|
| 118 |
return html_content
|
| 119 |
|
| 120 |
except WebDriverException as e:
|
|
|
|
| 135 |
logger.warning("Error closing browser: %s", str(e))
|
| 136 |
|
| 137 |
|
| 138 |
+
def _save_html_to_file(html_content: str, profile_url: str) -> str:
    """
    Save HTML content to a file in the html directory.

    Args:
        html_content (str): The HTML content to save
        profile_url (str): The original profile URL for filename generation

    Returns:
        str: The path to the saved file, or "" if saving failed
    """
    try:
        # Create html directory if it doesn't exist.
        # NOTE(review): assumes this module lives one level below the project
        # root (functions/...), so the html dir is a sibling of that folder.
        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
        os.makedirs(html_dir, exist_ok=True)

        # Generate filename from URL and timestamp. Strip trailing slashes
        # first so URLs like ".../in/john-doe/" still yield "john-doe"
        # instead of an empty segment (which would mislabel every such
        # profile as "unknown_profile").
        parsed_url = urlparse(profile_url)
        profile_name = parsed_url.path.rstrip('/').split('/')[-1] or 'unknown_profile'
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"linkedin_profile_{profile_name}_{timestamp}.html"

        # Full file path
        file_path = os.path.join(html_dir, filename)

        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        logger.info("HTML content saved to: %s", file_path)
        return file_path

    except Exception as e:  # pylint: disable=broad-exception-caught
        # Saving is deliberately best-effort: a filesystem error must not
        # abort the scraping flow, so we log and return an empty path.
        logger.warning("Failed to save HTML content: %s", str(e))
        return ""
|
| 173 |
+
|
| 174 |
+
|
| 175 |
def setup_chrome_driver_options() -> Options:
|
| 176 |
"""
|
| 177 |
Create and configure Chrome driver options for web scraping.
|