gperdrizet committed
Commit 68bd3e0 · Parent: 0195b9e

Ditched the idea of scraping biographic context from the public LinkedIn profile. Users will export their profile as a PDF and upload it instead.

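Where the upload-based flow goes next: the new handler in resumate.py below only acknowledges the file, but the point of the export is to mine it for biographic context. A minimal sketch of that step, assuming the pypdf package (not in requirements.txt as of this commit) and a hypothetical extract_profile_text helper:

# Sketch only: pypdf is an assumed future dependency and
# extract_profile_text is a hypothetical helper, not part of this commit.
from pypdf import PdfReader


def extract_profile_text(pdf_path: str) -> str:
    """Return the plain text of a LinkedIn profile PDF export."""
    reader = PdfReader(pdf_path)

    # LinkedIn exports are short; concatenating per-page text is enough here
    return "\n".join(page.extract_text() or "" for page in reader.pages)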
.devcontainer/devcontainer.json CHANGED
@@ -3,7 +3,7 @@
 {
     "name": "Python 3.10: resumate",
     "image": "mcr.microsoft.com/devcontainers/python:0-3.11",
-    "onCreateCommand": "sudo apt update && sudo apt upgrade -y && sudo apt install -y chromium && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
+    "onCreateCommand": "sudo apt update && sudo apt upgrade -y && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
     "customizations": {
         "vscode": {
             "extensions": [
.gitignore CHANGED
@@ -1,4 +1,3 @@
 __pycache__
 .vscode
-.venv
-html
+.venv
functions/__init__.py DELETED
@@ -1,10 +0,0 @@
-"""
-Functions package for the resumate application.
-
-This package contains modules for data acquisition, processing, and analysis
-of LinkedIn profiles, GitHub profiles, and job postings.
-"""
-
-from .context_acquisition import get_linkedin_profile_html
-
-__all__ = ['get_linkedin_profile_html']
functions/context_acquisition.py CHANGED
@@ -1,210 +1,3 @@
 """
-context_acquisition.py
-
-Functions for acquiring context from various sources including LinkedIn profiles,
-GitHub profiles, and job postings using browser automation.
-"""
-
-import time
-import logging
-import os
-from urllib.parse import urlparse
-
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, WebDriverException
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
-    """
-    Retrieve the HTML content of a LinkedIn profile using browser automation.
-    The HTML content is saved to the html directory and also returned.
-
-    Args:
-        profile_url (str): The URL of the LinkedIn profile to scrape
-        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)
-
-    Returns:
-        str: The HTML content of the LinkedIn profile page
-
-    Raises:
-        ValueError: If the URL is not a valid LinkedIn profile URL
-        WebDriverException: If there's an issue with the browser automation
-        TimeoutException: If the page takes too long to load
-
-    Note:
-        The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
-    """
-
-    # Validate LinkedIn URL
-    if not profile_url or not isinstance(profile_url, str):
-        raise ValueError("Profile URL must be a non-empty string")
-
-    if "linkedin.com/in/" not in profile_url:
-        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
-
-    # Configure Chrome options for headless browsing
-    chrome_options = setup_chrome_driver_options()
-
-    driver = None
-    try:
-        # Initialize the Chrome driver
-        logger.info("Initializing browser for URL: %s", profile_url)
-        driver = webdriver.Chrome(options=chrome_options)
-        driver.set_page_load_timeout(30)
-
-        # Navigate to the LinkedIn profile
-        logger.info("Navigating to LinkedIn profile...")
-        driver.get(profile_url)
-
-        # Wait for the page to load
-        # Look for common LinkedIn profile elements
-        wait = WebDriverWait(driver, wait_time)
-
-        try:
-            # Wait for either the main content or login prompt
-            wait.until(
-                EC.any_of(
-                    EC.presence_of_element_located((  # Profile header
-                        By.CSS_SELECTOR,
-                        ".pv-top-card"
-                    )),
-                    EC.presence_of_element_located((  # Profile section
-                        By.CSS_SELECTOR,
-                        ".profile-section"
-                    )),
-                    EC.presence_of_element_located((  # Auth wall
-                        By.CSS_SELECTOR,
-                        ".authwall"
-                    )),
-                    EC.presence_of_element_located((  # Public profile
-                        By.CSS_SELECTOR,
-                        ".public-profile"
-                    )),
-                )
-            )
-
-        except TimeoutException:
-            logger.warning(
-                "Standard LinkedIn elements not found, proceeding with current page state"
-            )
-
-        # Additional wait to ensure dynamic content loads
-        time.sleep(2)
-
-        # Get the page HTML
-        html_content = driver.page_source
-
-        # Clean up HTML by removing blank lines
-        cleaned_html = _clean_html_content(html_content)
-
-        logger.info(
-            "Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
-            len(html_content),
-            len(cleaned_html)
-        )
-
-        # Save HTML content to file
-        _save_html_to_file(cleaned_html, profile_url)
-
-        return cleaned_html
-
-    except WebDriverException as e:
-        logger.error("WebDriver error occurred: %s", str(e))
-        raise WebDriverException(f"Browser automation failed: {str(e)}") from e
-
-    except Exception as e:
-        logger.error("Unexpected error occurred: %s", str(e))
-        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
-
-    finally:
-        # Always clean up the driver
-        if driver:
-            try:
-                driver.quit()
-                logger.info("Browser session closed")
-            except WebDriverException as e:
-                logger.warning("Error closing browser: %s", str(e))
-
-
-def _clean_html_content(html_content: str) -> str:
-    """
-    Clean HTML content by removing blank lines and excessive whitespace.
-
-    Args:
-        html_content (str): The raw HTML content to clean
-
-    Returns:
-        str: Cleaned HTML content with blank lines removed
-    """
-    # Split into lines, strip whitespace, and filter out empty lines
-    lines = html_content.split('\n')
-    cleaned_lines = [line.rstrip() for line in lines if line.strip()]
-
-    # Join back together with single newlines
-    return '\n'.join(cleaned_lines)
-
-
-def _save_html_to_file(html_content: str, profile_url: str) -> str:
-    """
-    Save HTML content to a file in the html directory.
-
-    Args:
-        html_content (str): The HTML content to save
-        profile_url (str): The original profile URL for filename generation
-
-    Returns:
-        str: The path to the saved file
-    """
-    try:
-        # Create html directory if it doesn't exist
-        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
-        os.makedirs(html_dir, exist_ok=True)
-
-        # Generate filename from URL and timestamp
-        parsed_url = urlparse(profile_url)
-        profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
-        filename = f"linkedin_profile_{profile_name}.html"
-
-        # Full file path
-        file_path = os.path.join(html_dir, filename)
-
-        # Save HTML content
-        with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(html_content)
-
-        logger.info("HTML content saved to: %s", file_path)
-        return file_path
-
-    except Exception as e:  # pylint: disable=broad-exception-caught
-        logger.warning("Failed to save HTML content: %s", str(e))
-        return ""
-
-
-def setup_chrome_driver_options() -> Options:
-    """
-    Create and configure Chrome driver options for web scraping.
-
-    Returns:
-        Options: Configured Chrome options object
-    """
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")  # Run in background
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--disable-gpu")
-    chrome_options.add_argument("--window-size=1920,1080")
-    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
-                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
-    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-    chrome_options.add_experimental_option('useAutomationExtension', False)
-
-    return chrome_options
+Functions for acquiring context from various sources.
+"""
packages.txt DELETED
@@ -1 +0,0 @@
-chromium-driver
requirements.txt CHANGED
@@ -1,3 +1 @@
-gradio==5.35.0
-selenium>=4.0.0
-webdriver-manager>=3.8.0
+gradio==5.35.0
resumate.py CHANGED
@@ -1,10 +1,10 @@
 """
 resumate.py
 
-A simple Gradio UI for collecting user profile and job post URLs.
+A simple Gradio UI for collecting user profile and job post information.
 
-This app provides three text input fields for:
-- LinkedIn profile URL
+This app provides inputs for:
+- LinkedIn resume export PDF file upload
 - GitHub profile URL
 - LinkedIn job post URL
 
@@ -15,39 +15,44 @@ To run:
 """
 
 import gradio as gr
-from functions.context_acquisition import get_linkedin_profile_html
 
 
-def process_inputs(linkedin_url, github_url, job_post_url):
+def process_inputs(linkedin_pdf, github_url, job_post_url):
     """
-    Process the input URLs and retrieve content from LinkedIn profile.
+    Process the input files and URLs.
 
     Args:
-        linkedin_url (str): LinkedIn profile URL
+        linkedin_pdf: Uploaded LinkedIn resume export PDF file
        github_url (str): GitHub profile URL
        job_post_url (str): LinkedIn job post URL
 
     Returns:
-        str: Formatted output with URL information and LinkedIn profile status
+        str: Formatted output with file and URL information
     """
-    result = f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}\n\n"
-
-    # Try to retrieve LinkedIn profile HTML if URL is provided
-    if linkedin_url and linkedin_url.strip():
-        try:
-            result += "Attempting to retrieve LinkedIn profile...\n"
-            html_content = get_linkedin_profile_html(linkedin_url)
-            result += f"LinkedIn profile HTML ({len(html_content)} characters)\n"
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            result += f"❌ Failed to retrieve LinkedIn profile: {str(e)}\n"
+    result = ""
+
+    # Process LinkedIn PDF file
+    if linkedin_pdf is not None:
+        result += f"✅ LinkedIn Resume PDF uploaded: {linkedin_pdf.name}\n"
+        result += f"   File size: {len(linkedin_pdf.read())} bytes\n\n"
+        # Reset file pointer for potential future use
+        linkedin_pdf.seek(0)
+    else:
+        result += "❌ No LinkedIn resume PDF file uploaded\n\n"
+
+    # Process other inputs
+    result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
+    result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"
 
     return result
 
 with gr.Blocks() as demo:
     gr.Markdown("# Resumate: Profile & Job Post Input")
-    linkedin_profile = gr.Textbox(
-        label="LinkedIn Profile URL",
-        placeholder="Enter your LinkedIn profile URL"
+
+    linkedin_pdf = gr.File(
+        label="LinkedIn Resume Export PDF",
+        file_types=[".pdf"],
+        file_count="single"
     )
 
     github_profile = gr.Textbox(
@@ -61,11 +66,11 @@ with gr.Blocks() as demo:
     )
 
     submit_btn = gr.Button("Submit")
-    output = gr.Textbox(label="Output", lines=3)
+    output = gr.Textbox(label="Output", lines=5)
 
     submit_btn.click(  # pylint: disable=no-member
         process_inputs,
-        inputs=[linkedin_profile, github_profile, job_post],
+        inputs=[linkedin_pdf, github_profile, job_post],
        outputs=output
     )
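One caveat on the new handler: recent Gradio releases default gr.File to type="filepath", in which case process_inputs receives a plain path string and the .name/.read()/.seek() calls above would fail. A defensive sketch (an assumption about the component's return type, not part of this commit) that handles both shapes:

import os

def describe_upload(linkedin_pdf) -> str:
    """Summarize a gr.File value whether it arrives as a path or a file object."""
    if linkedin_pdf is None:
        return "❌ No LinkedIn resume PDF file uploaded\n\n"

    if isinstance(linkedin_pdf, str):
        # type="filepath": Gradio hands over a path to the uploaded temp file
        name, size = linkedin_pdf, os.path.getsize(linkedin_pdf)
    else:
        # File-object style value exposing .name / .read() / .seek()
        name = linkedin_pdf.name
        size = len(linkedin_pdf.read())
        linkedin_pdf.seek(0)  # Reset pointer for any later reads

    return f"✅ LinkedIn Resume PDF uploaded: {name}\n   File size: {size} bytes\n\n"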
tests/test_context_acquisition.py CHANGED
@@ -1,252 +1,3 @@
 """
 Unit tests for the context_acquisition module.
 """
-
-import unittest
-import os
-import tempfile
-import shutil
-from selenium.webdriver.chrome.options import Options
-
-import functions.context_acquisition
-
-# Import the functions to test
-from functions.context_acquisition import (
-    _clean_html_content,
-    _save_html_to_file,
-    setup_chrome_driver_options
-)
-
-
-class TestCleanHTMLContent(unittest.TestCase):
-    """Test cases for the _clean_html_content function."""
-
-    def test_remove_blank_lines(self):
-        """Test removal of blank lines from HTML content."""
-        html_with_blanks = """<html>
-
-<head>
-<title>Test</title>
-
-</head>
-
-<body>
-<div>Content</div>
-
-</body>
-</html>"""
-
-        expected = """<html>
-<head>
-<title>Test</title>
-</head>
-<body>
-<div>Content</div>
-</body>
-</html>"""
-
-        result = _clean_html_content(html_with_blanks)
-        self.assertEqual(result, expected)
-
-    def test_strip_trailing_whitespace(self):
-        """Test removal of trailing whitespace from lines."""
-        html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
-        expected = "<div>Content</div>\n<p>Text</p>"
-
-        result = _clean_html_content(html_with_trailing)
-        self.assertEqual(result, expected)
-
-    def test_empty_content(self):
-        """Test handling of empty or whitespace-only content."""
-        self.assertEqual(_clean_html_content(""), "")
-        self.assertEqual(_clean_html_content(" \n\n\t "), "")
-        self.assertEqual(_clean_html_content("\n"), "")
-
-    def test_single_line_content(self):
-        """Test cleaning of single line content."""
-        single_line = "<html><body>Content</body></html>"
-        result = _clean_html_content(single_line)
-        self.assertEqual(result, single_line)
-
-    def test_mixed_whitespace(self):
-        """Test handling of mixed whitespace characters."""
-        mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
-        expected = "<div>\n<p>Text</p>\n</div>"
-        result = _clean_html_content(mixed)
-        self.assertEqual(result, expected)
-
-
-class TestSaveHTMLToFile(unittest.TestCase):
-    """Test cases for the _save_html_to_file function."""
-
-    def setUp(self):
-        """Set up test fixtures with temporary directory."""
-        self.test_dir = tempfile.mkdtemp()
-        self.test_html = "<html><body>Test content</body></html>"
-        self.test_url = "https://www.linkedin.com/in/johndoe"
-
-    def tearDown(self):
-        """Clean up temporary directory."""
-        if os.path.exists(self.test_dir):
-            shutil.rmtree(self.test_dir)
-
-    def test_successful_file_save(self):
-        """Test successful saving of HTML content to file."""
-        # Temporarily change the file path calculation
-        original_dirname = os.path.dirname
-
-        def mock_dirname(path):
-            if path.endswith('context_acquisition.py'):
-                return self.test_dir
-            return original_dirname(path)
-
-        # Replace os.path.dirname temporarily
-        original_func = functions.context_acquisition.os.path.dirname
-        functions.context_acquisition.os.path.dirname = mock_dirname
-
-        try:
-            result = _save_html_to_file(self.test_html, self.test_url)
-
-            # Verify file was created
-            self.assertTrue(os.path.exists(result))
-            self.assertTrue(result.endswith('.html'))
-
-            # Verify file content
-            with open(result, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self.assertEqual(content, self.test_html)
-
-        finally:
-            # Restore original function
-            functions.context_acquisition.os.path.dirname = original_func
-
-
-class TestSetupChromeDriverOptions(unittest.TestCase):
-    """Test cases for the setup_chrome_driver_options function."""
-
-    def test_chrome_options_configuration(self):
-        """Test that Chrome options are properly configured."""
-        options = setup_chrome_driver_options()
-
-        # Verify that options object is returned
-        self.assertIsNotNone(options)
-
-        # Verify it's the correct type
-        self.assertIsInstance(options, Options)
-
-    def test_chrome_options_arguments(self):
-        """Test that required Chrome arguments are set."""
-        options = setup_chrome_driver_options()
-
-        # Access the arguments (this is implementation dependent)
-        # Note: This test verifies the function runs without error
-        # Specific argument verification would require accessing private attributes
-        self.assertIsNotNone(options)
-
-
-class TestURLValidation(unittest.TestCase):
-    """Test cases for URL validation logic (extracted from main function)."""
-
-    def test_valid_linkedin_urls(self):
-        """Test validation of valid LinkedIn URLs."""
-        valid_urls = [
-            "https://www.linkedin.com/in/johndoe",
-            "https://linkedin.com/in/jane-smith",
-            "http://www.linkedin.com/in/test123",
-            "https://www.linkedin.com/in/user-name-with-dashes",
-        ]
-
-        for url in valid_urls:
-            # Test the validation logic directly
-            self.assertTrue(isinstance(url, str))
-            self.assertTrue(url.strip())
-            self.assertIn("linkedin.com/in/", url)
-
-    def test_invalid_linkedin_urls(self):
-        """Test validation of invalid LinkedIn URLs."""
-        invalid_urls = [
-            "",
-            None,
-            "https://www.example.com/profile",
-            "https://www.linkedin.com/company/test",
-            "https://github.com/user",
-            "not-a-url",
-        ]
-
-        for url in invalid_urls:
-            # Test the validation logic directly
-            if url is None or not isinstance(url, str):
-                self.assertTrue(url is None or not isinstance(url, str))
-            elif not url.strip():
-                self.assertFalse(url.strip())
-            else:
-                self.assertNotIn("linkedin.com/in/", url)
-
-
-class TestHTMLContentProcessing(unittest.TestCase):
-    """Test cases for HTML content processing workflows."""
-
-    def test_html_cleaning_workflow(self):
-        """Test the complete HTML cleaning workflow."""
-        raw_html = """<!DOCTYPE html>
-<html>
-
-<head>
-<title>LinkedIn Profile</title>
-
-</head>
-
-<body>
-<div class="profile">
-<h1>John Doe</h1>
-
-<p>Software Engineer</p>
-</div>
-
-</body>
-
-</html>"""
-
-        cleaned = _clean_html_content(raw_html)
-
-        # Verify no empty lines
-        lines = cleaned.split('\n')
-        for line in lines:
-            self.assertTrue(line.strip(), f"Found empty line: '{line}'")
-
-        # Verify content is preserved
-        self.assertIn("John Doe", cleaned)
-        self.assertIn("Software Engineer", cleaned)
-        self.assertIn("LinkedIn Profile", cleaned)
-
-    def test_minimal_html_cleaning(self):
-        """Test cleaning of minimal HTML content."""
-        minimal_html = "<html><body>Content</body></html>"
-        result = _clean_html_content(minimal_html)
-        self.assertEqual(result, minimal_html)
-
-    def test_complex_whitespace_patterns(self):
-        """Test cleaning of complex whitespace patterns."""
-        complex_html = """<div>
-\t\t
-<span>Text</span>
-\t
-
-<p>Paragraph</p>
-\t
-</div>"""
-
-        result = _clean_html_content(complex_html)
-        lines = result.split('\n')
-
-        # Should have no empty lines
-        for line in lines:
-            self.assertTrue(line.strip())
-
-        # Should preserve content
-        self.assertIn("Text", result)
-        self.assertIn("Paragraph", result)
-
-
-if __name__ == '__main__':
-    unittest.main()