Spaces:

gperdrizet
/

resumate

Configuration error

App Files Files

xet

Community

gperdrizet committed on Jul 8

Commit

7dcc57a

verified ·

1 Parent(s): 5826d5a

Fixed LinkedIn HTML file name.

Browse files

Files changed (1) hide show

functions/context_acquisition.py +29 -7

functions/context_acquisition.py CHANGED Viewed

@@ -8,7 +8,6 @@ GitHub profiles, and job postings using browser automation.
 import time
 import logging
 import os
-from datetime import datetime
 from urllib.parse import urlparse
 from selenium import webdriver
@@ -110,12 +109,16 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
         # Get the page HTML
         html_content = driver.page_source
-        logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
         # Save HTML content to file
-        _save_html_to_file(html_content, profile_url)
-        return html_content
     except WebDriverException as e:
         logger.error("WebDriver error occurred: %s", str(e))
@@ -135,6 +138,24 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
                 logger.warning("Error closing browser: %s", str(e))
 def _save_html_to_file(html_content: str, profile_url: str) -> str:
     """
     Save HTML content to a file in the html directory.
@@ -153,9 +174,10 @@ def _save_html_to_file(html_content: str, profile_url: str) -> str:
         # Generate filename from URL and timestamp
         parsed_url = urlparse(profile_url)
-        profile_name = parsed_url.path.split('/')[-1] or 'unknown_profile'
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"linkedin_profile_{profile_name}_{timestamp}.html"
         # Full file path
         file_path = os.path.join(html_dir, filename)

 import time
 import logging
 import os
 from urllib.parse import urlparse
 from selenium import webdriver
         # Get the page HTML
         html_content = driver.page_source
+        # Clean up HTML by removing blank lines
+        cleaned_html = _clean_html_content(html_content)
+        logger.info("Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
+                   len(html_content), len(cleaned_html))
         # Save HTML content to file
+        _save_html_to_file(cleaned_html, profile_url)
+        return cleaned_html
     except WebDriverException as e:
         logger.error("WebDriver error occurred: %s", str(e))
                 logger.warning("Error closing browser: %s", str(e))
+def _clean_html_content(html_content: str) -> str:
+    """
+    Clean HTML content by removing blank lines and trailing whitespace.
+    Lines that are empty or whitespace-only are dropped; leading indentation
+    on the remaining lines is preserved.
+    Args:
+        html_content (str): The raw HTML content to clean
+    Returns:
+        str: Cleaned HTML content, with the kept lines joined by single newlines
+    """
+    # Split on newlines, drop whitespace-only lines, and trim trailing whitespace only
+    lines = html_content.split('\n')
+    cleaned_lines = [line.rstrip() for line in lines if line.strip()]
+    # Join back together with single newlines
+    return '\n'.join(cleaned_lines)
 def _save_html_to_file(html_content: str, profile_url: str) -> str:
     """
     Save HTML content to a file in the html directory.
         # Generate filename from URL and timestamp
         parsed_url = urlparse(profile_url)
+        print(parsed_url)
+        profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
+        print(profile_name)
+        filename = f"linkedin_profile_{profile_name}.html"
         # Full file path
         file_path = os.path.join(html_dir, filename)