Fixed merge conflict
- .devcontainer/devcontainer.json +1 -1
- .gitignore +1 -2
- functions/__init__.py +0 -10
- functions/context_acquisition.py +266 -166
- packages.txt +0 -1
- requirements.txt +1 -2
- resumate.py +50 -22
- tests/test_context_acquisition.py +0 -249
.devcontainer/devcontainer.json
CHANGED
@@ -3,7 +3,7 @@
 {
   "name": "Python 3.10: resumate",
   "image": "mcr.microsoft.com/devcontainers/python:0-3.11",
-  "onCreateCommand": "sudo apt update && sudo apt upgrade -y && …",
+  "onCreateCommand": "sudo apt update && sudo apt upgrade -y && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
   "customizations": {
     "vscode": {
       "extensions": [
.gitignore
CHANGED
@@ -1,4 +1,3 @@
 __pycache__
 .vscode
-.venv
-html
+.venv
functions/__init__.py
DELETED
@@ -1,10 +0,0 @@
-"""
-Functions package for the resumate application.
-
-This package contains modules for data acquisition, processing, and analysis
-of LinkedIn profiles, GitHub profiles, and job postings.
-"""
-
-from .context_acquisition import get_linkedin_profile_html
-
-__all__ = ['get_linkedin_profile_html']
functions/context_acquisition.py
CHANGED
@@ -1,210 +1,310 @@
 """
 context_acquisition.py
 
-Functions for acquiring context from various sources including …
-GitHub profiles, and job …
+Functions for acquiring context from various sources including PDF text extraction,
+GitHub profiles, and job posting text.
 """
 
-import …
+import re
 import logging
+import io
 import os
-
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, WebDriverException
+import PyPDF2
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-def get_linkedin_profile_html(profile_url, wait_time=10):
-    """
-    …
-
-    Args:
-        …
-        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)
-
-    Returns:
-        str: The HTML content of the LinkedIn profile page
-
-    Raises:
-        WebDriverException: If there's an issue with the browser automation
-        TimeoutException: If the page takes too long to load
-    """
-    …
-        raise ValueError("Profile URL must be a non-empty string")
-
-    if "linkedin.com/in/" not in profile_url:
-        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
-
-    # Configure Chrome options for headless browsing
-    chrome_options = setup_chrome_driver_options()
-
-    driver = None
-    try:
-        …
-            EC.any_of(
-                EC.presence_of_element_located((  # Profile header
-                    By.CSS_SELECTOR,
-                    ".pv-top-card"
-                )),
-                EC.presence_of_element_located((  # Profile section
-                    By.CSS_SELECTOR,
-                    ".profile-section"
-                )),
-                EC.presence_of_element_located((  # Auth wall
-                    By.CSS_SELECTOR,
-                    ".authwall"
-                )),
-                EC.presence_of_element_located((  # Public profile
-                    By.CSS_SELECTOR,
-                    ".public-profile"
-                )),
-            )
-        )
-
-    except TimeoutException:
-        logger.warning(
-            "Standard LinkedIn elements not found, proceeding with current page state"
-        )
-
-        # Additional wait to ensure dynamic content loads
-        time.sleep(2)
-
-        # Get the page HTML
-        html_content = driver.page_source
-
-        # Clean up HTML by removing blank lines
-        cleaned_html = _clean_html_content(html_content)
-
-        logger.info(
-            "Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
-            len(html_content),
-            len(cleaned_html)
-        )
-
-        # Save HTML content to file
-        _save_html_to_file(cleaned_html, profile_url)
-
-        return cleaned_html
-
-    except WebDriverException as e:
-        logger.error("WebDriver error occurred: %s", str(e))
-        raise WebDriverException(f"Browser automation failed: {str(e)}") from e
-
-    except Exception as e:
-        logger.error("Unexpected error occurred: %s", str(e))
-        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
-
-    finally:
-        # Always clean up the driver
-        if driver:
-            try:
-                …
-
-
-def _clean_html_content(html_content):
-    """
-    …
-
-    Args:
-        …
-
-    Returns:
-        …
-    """
-    …
-
-
-def _save_html_to_file(html_content, profile_url):
-    """
-    …
-
-    Args:
-        …
-        …
-
-    Returns:
-        str: …
-    """
-    …
-    logger.info("HTML content saved to: %s", file_path)
-    return file_path
-
-    …
-    return ""
-
-
-def setup_chrome_driver_options():
-    """
-    …
-
-    Returns:
-        …
-    """
-    …
-        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
-    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-    chrome_options.add_experimental_option('useAutomationExtension', False)
-
-    return chrome_options
+def extract_text_from_linkedin_pdf(pdf_file) -> dict:
+    """
+    Extract and structure text content from an uploaded LinkedIn resume export PDF file
+    for optimal LLM processing.
+
+    Args:
+        pdf_file: The file path string to the uploaded PDF file
+
+    Returns:
+        dict: Dictionary containing extraction status, structured text content, and metadata
+
+    Example:
+        {
+            "status": "success",
+            "structured_text": {
+                "sections": {...},
+                "full_text": "...",
+                "llm_formatted": "...",
+                "summary": "..."
+            },
+            "metadata": {...}
+        }
+    """
+    if pdf_file is None:
+        return {"status": "error", "message": "No PDF file provided"}
+
+    try:
+        # Get filename from path
+        filename = os.path.basename(pdf_file)
+
+        # Read the PDF file from the file path
+        with open(pdf_file, 'rb') as file:
+            file_content = file.read()
+            file_size = len(file_content)
+
+        # Create PDF reader from the file content
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
+
+        # Extract text from all pages
+        extracted_text = ""
+        num_pages = len(pdf_reader.pages)
+
+        for page_num in range(num_pages):
+            try:
+                page = pdf_reader.pages[page_num]
+                page_text = page.extract_text()
+                extracted_text += page_text + "\n\n"
+            except Exception as e:
+                logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
+                continue
+
+        # Clean and structure the extracted text for LLM consumption
+        structured_content = _structure_resume_text(extracted_text)
+
+        if not structured_content["full_text"].strip():
+            return {
+                "status": "warning",
+                "structured_text": structured_content,
+                "metadata": {
+                    "filename": filename,
+                    "file_size": file_size,
+                    "pages": num_pages
+                },
+                "message": "PDF processed but no text content was extracted"
+            }
+
+        logger.info(f"Successfully extracted and structured {len(structured_content['full_text'])} characters from {filename}")
+
+        return {
+            "status": "success",
+            "structured_text": structured_content,
+            "metadata": {
+                "filename": filename,
+                "file_size": file_size,
+                "pages": num_pages,
+                "sections_found": list(structured_content["sections"].keys())
+            },
+            "message": f"Text extracted and structured successfully from {num_pages} pages"
+        }
+
+    except Exception as e:
+        logger.error(f"Error processing PDF file: {str(e)}")
+        return {
+            "status": "error",
+            "message": f"Failed to extract text from PDF: {str(e)}"
+        }
+
+
+def _structure_resume_text(text: str) -> dict:
+    """
+    Structure resume text into logical sections for optimal LLM processing.
+
+    Args:
+        text (str): Raw extracted text from PDF
+
+    Returns:
+        dict: Structured text with sections, full text, and summary
+    """
+    if not text:
+        return {
+            "sections": {},
+            "full_text": "",
+            "llm_formatted": "",
+            "summary": "",
+            "format": "structured_resume",
+            "word_count": 0,
+            "section_count": 0
+        }
+
+    # Clean the text first
+    cleaned_text = _clean_extracted_text(text)
+
+    # Define section patterns (common LinkedIn export sections)
+    section_patterns = {
+        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
+        "summary": r"(?i)(summary|about|overview|profile)",
+        "experience": r"(?i)(experience|work|employment|professional)",
+        "education": r"(?i)(education|academic|university|college|school)",
+        "skills": r"(?i)(skills|competencies|technologies|technical)",
+        "certifications": r"(?i)(certification|certificate|license)",
+        "projects": r"(?i)(project|portfolio)",
+        "achievements": r"(?i)(achievement|award|honor|recognition)",
+        "languages": r"(?i)(language|linguistic)",
+        "volunteer": r"(?i)(volunteer|community|charity)"
+    }
+
+    # Split text into lines for processing
+    lines = cleaned_text.split('\n')
+    sections = {}
+    current_section = "general"
+    current_content = []
+
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if line is a section header
+        section_found = None
+        for section_name, pattern in section_patterns.items():
+            if re.match(pattern, line):
+                section_found = section_name
+                break
+
+        if section_found:
+            # Save previous section content
+            if current_content:
+                sections[current_section] = '\n'.join(current_content)
+
+            # Start new section
+            current_section = section_found
+            current_content = [line]
+        else:
+            current_content.append(line)
+
+    # Save the last section
+    if current_content:
+        sections[current_section] = '\n'.join(current_content)
+
+    # Create a structured summary for LLM context
+    summary_parts = []
+    if "contact_info" in sections:
+        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
+    if "summary" in sections:
+        summary_parts.append(f"SUMMARY: {sections['summary']}")
+    if "experience" in sections:
+        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
+    if "education" in sections:
+        summary_parts.append(f"EDUCATION: {sections['education']}")
+    if "skills" in sections:
+        summary_parts.append(f"SKILLS: {sections['skills']}")
+
+    # Create LLM-optimized format
+    llm_formatted_text = _format_for_llm(sections, cleaned_text)
+
+    return {
+        "sections": sections,
+        "full_text": cleaned_text,
+        "llm_formatted": llm_formatted_text,
+        "summary": '\n\n'.join(summary_parts),
+        "format": "structured_resume",
+        "word_count": len(cleaned_text.split()),
+        "section_count": len(sections)
+    }
+
+
+def _format_for_llm(sections: dict, full_text: str) -> str:
+    """
+    Format the resume sections in an optimal way for LLM processing.
+
+    Args:
+        sections (dict): Structured sections
+        full_text (str): Full cleaned text
+
+    Returns:
+        str: LLM-optimized formatted text
+    """
+    formatted_parts = ["=== RESUME CONTENT ===\n"]
+
+    # Prioritize sections in logical order for LLM
+    priority_order = ["summary", "contact_info", "experience", "education", "skills",
+                      "certifications", "projects", "achievements", "languages", "volunteer"]
+
+    # Add prioritized sections
+    for section_name in priority_order:
+        if section_name in sections:
+            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
+            formatted_parts.append(sections[section_name])
+            formatted_parts.append("")  # Empty line between sections
+
+    # Add any remaining sections
+    for section_name, content in sections.items():
+        if section_name not in priority_order and section_name != "general":
+            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
+            formatted_parts.append(content)
+            formatted_parts.append("")
+
+    # Add general content if exists
+    if "general" in sections:
+        formatted_parts.append("[ADDITIONAL INFORMATION]")
+        formatted_parts.append(sections["general"])
+
+    formatted_parts.append("\n=== END RESUME ===")
+
+    return '\n'.join(formatted_parts)
+
+
+def _clean_extracted_text(text: str) -> str:
+    """
+    Clean and normalize extracted text from PDF for better LLM processing.
+
+    Args:
+        text (str): Raw extracted text
+
+    Returns:
+        str: Cleaned text optimized for LLM consumption
+    """
+    if not text:
+        return ""
+
+    # Remove excessive whitespace and normalize line endings
+    text = re.sub(r'\r\n', '\n', text)
+    text = re.sub(r'\r', '\n', text)
+
+    # Split into lines and clean each line
+    lines = text.split('\n')
+    cleaned_lines = []
+
+    for line in lines:
+        # Strip whitespace
+        cleaned_line = line.strip()
+
+        # Skip empty lines and very short lines (likely artifacts)
+        if len(cleaned_line) < 2:
+            continue
+
+        # Remove common PDF artifacts
+        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
+        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
+
+        if cleaned_line:
+            cleaned_lines.append(cleaned_line)
+
+    # Join lines and normalize spacing
+    cleaned_text = '\n'.join(cleaned_lines)
+
+    # Normalize multiple spaces to single spaces
+    cleaned_text = re.sub(r' +', ' ', cleaned_text)
+
+    # Normalize multiple newlines to maximum of 2
+    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+
+    return cleaned_text.strip()
+
+
+def get_llm_context_from_resume(extraction_result: dict) -> str:
+    """
+    Extract the best formatted text for LLM context from the extraction result.
+
+    Args:
+        extraction_result (dict): Result from extract_text_from_linkedin_pdf
+
+    Returns:
+        str: Formatted text ready for LLM context
+    """
+    if extraction_result.get("status") != "success":
+        return ""
+
+    structured_text = extraction_result.get("structured_text", {})
+
+    # Return the LLM-formatted version if available, otherwise fall back to full text
+    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
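Taken together, the new module is meant to be driven roughly like this (a minimal sketch, not part of the commit; "resume.pdf" is an illustrative path to a local LinkedIn export):

    from functions.context_acquisition import (
        extract_text_from_linkedin_pdf,
        get_llm_context_from_resume,
    )

    # Parse and section a local LinkedIn export PDF
    result = extract_text_from_linkedin_pdf("resume.pdf")

    if result["status"] == "success":
        # e.g. ['summary', 'experience', 'skills', ...]
        print(result["metadata"]["sections_found"])
        # Section-ordered plain text, ready to drop into an LLM prompt
        llm_context = get_llm_context_from_resume(result)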
packages.txt
DELETED
@@ -1 +0,0 @@
-chromium
requirements.txt
CHANGED
@@ -1,3 +1,2 @@
 gradio==5.35.0
-…
-webdriver-manager>=3.8.0
+PyPDF2==3.0.1
resumate.py
CHANGED
@@ -1,10 +1,10 @@
 """
 resumate.py
 
-A simple Gradio UI for collecting user profile and job post …
+A simple Gradio UI for collecting user profile and job post information.
 
-This app provides …
-- LinkedIn …
+This app provides inputs for:
+- LinkedIn resume export PDF file upload
 - GitHub profile URL
 - LinkedIn job post URL
 
@@ -15,39 +15,67 @@ To run:
 """
 
 import gradio as gr
-from functions.context_acquisition import …
+from functions.context_acquisition import extract_text_from_linkedin_pdf, get_llm_context_from_resume
 
 
-def process_inputs(…):
+def process_inputs(linkedin_pdf, github_url, job_post_url):
     """
-    Process the input …
+    Process the input files and URLs.
 
     Args:
-        …
+        linkedin_pdf: Uploaded LinkedIn resume export PDF file
         github_url (str): GitHub profile URL
         job_post_url (str): LinkedIn job post URL
 
     Returns:
-        str: Formatted output with …
+        str: Formatted output with file and URL information
     """
-    result = …
-    …
-    result += "…
-    …
+    result = ""
+
+    # Process LinkedIn PDF file
+    if linkedin_pdf is not None:
+        result += f"✅ LinkedIn Resume PDF uploaded: {linkedin_pdf.name}\n"
+
+        # Extract and structure text from the PDF
+        extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
+
+        if extraction_result["status"] == "success":
+            structured_text = extraction_result["structured_text"]
+            result += "✅ Text extraction successful\n"
+            result += structured_text["llm_formatted"] + "\n"
+        elif extraction_result["status"] == "warning":
+            result += f"⚠️ Text extraction: {extraction_result['message']}\n\n"
+        else:
+            result += f"❌ Text extraction failed: {extraction_result['message']}\n\n"
+    else:
+        result += "❌ No LinkedIn resume PDF file uploaded\n\n"
+
+    # Process other inputs
+    result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
+    result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"
 
     return result
 
 with gr.Blocks() as demo:
     gr.Markdown("# Resumate: Profile & Job Post Input")
-    …
+
+    gr.Markdown("""
+    ## How to Export Your LinkedIn Profile as PDF
+
+    1. **Go to your LinkedIn profile page** (linkedin.com/in/your-profile)
+    2. **Click "More" button** (three dots) in your profile header section
+    3. **Select "Save to PDF"** from the dropdown menu
+    4. **Wait for the download** - LinkedIn will generate and download your profile as a PDF file
+    5. **Upload the downloaded PDF** using the file upload box below
+
+    💡 **Tip**: Make sure your LinkedIn profile is complete and up-to-date before exporting for best results!
+    """)
+
+    linkedin_pdf = gr.File(
+        label="LinkedIn Resume Export PDF",
+        file_types=[".pdf"],
+        file_count="single"
     )
 
     github_profile = gr.Textbox(
@@ -61,11 +89,11 @@ with gr.Blocks() as demo:
     )
 
     submit_btn = gr.Button("Submit")
-    output = gr.Textbox(label="Output", lines=…)
+    output = gr.Textbox(label="Output", lines=20, max_lines=50, show_copy_button=True)
 
     submit_btn.click(  # pylint: disable=no-member
         process_inputs,
-        inputs=[…],
+        inputs=[linkedin_pdf, github_profile, job_post],
         outputs=output
     )
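One detail of the handler worth noting: it reads linkedin_pdf.name, treating the upload as an object whose .name attribute is the on-disk path of the temporary copy Gradio writes, and forwards that path to the path-based extractor rather than reading bytes itself. That contract also makes the handler easy to exercise without launching the UI (a sketch; SimpleNamespace stands in for the upload object, the PDF path is illustrative, and it assumes importing resumate does not auto-launch the app):

    from types import SimpleNamespace

    from resumate import process_inputs

    # Anything exposing .name that points at a real PDF mimics a Gradio upload
    stub_upload = SimpleNamespace(name="sample_resume.pdf")

    print(process_inputs(
        stub_upload,
        "https://github.com/example-user",       # illustrative
        "https://www.linkedin.com/jobs/view/1",  # illustrative
    ))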
tests/test_context_acquisition.py
CHANGED
@@ -1,252 +1,3 @@
 """
 Unit tests for the context_acquisition module.
 """
-
-import unittest
-import os
-import tempfile
-import shutil
-from selenium.webdriver.chrome.options import Options
-
-import functions.context_acquisition
-
-# Import the functions to test
-from functions.context_acquisition import (
-    _clean_html_content,
-    _save_html_to_file,
-    setup_chrome_driver_options
-)
-
-
-class TestCleanHTMLContent(unittest.TestCase):
-    """Test cases for the _clean_html_content function."""
-
-    def test_remove_blank_lines(self):
-        """Test removal of blank lines from HTML content."""
-        html_with_blanks = """<html>
-
-<head>
-<title>Test</title>
-
-</head>
-
-<body>
-<div>Content</div>
-
-</body>
-</html>"""
-
-        expected = """<html>
-<head>
-<title>Test</title>
-</head>
-<body>
-<div>Content</div>
-</body>
-</html>"""
-
-        result = _clean_html_content(html_with_blanks)
-        self.assertEqual(result, expected)
-
-    def test_strip_trailing_whitespace(self):
-        """Test removal of trailing whitespace from lines."""
-        html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
-        expected = "<div>Content</div>\n<p>Text</p>"
-
-        result = _clean_html_content(html_with_trailing)
-        self.assertEqual(result, expected)
-
-    def test_empty_content(self):
-        """Test handling of empty or whitespace-only content."""
-        self.assertEqual(_clean_html_content(""), "")
-        self.assertEqual(_clean_html_content(" \n\n\t "), "")
-        self.assertEqual(_clean_html_content("\n"), "")
-
-    def test_single_line_content(self):
-        """Test cleaning of single line content."""
-        single_line = "<html><body>Content</body></html>"
-        result = _clean_html_content(single_line)
-        self.assertEqual(result, single_line)
-
-    def test_mixed_whitespace(self):
-        """Test handling of mixed whitespace characters."""
-        mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
-        expected = "<div>\n<p>Text</p>\n</div>"
-        result = _clean_html_content(mixed)
-        self.assertEqual(result, expected)
-
-
-class TestSaveHTMLToFile(unittest.TestCase):
-    """Test cases for the _save_html_to_file function."""
-
-    def setUp(self):
-        """Set up test fixtures with temporary directory."""
-        self.test_dir = tempfile.mkdtemp()
-        self.test_html = "<html><body>Test content</body></html>"
-        self.test_url = "https://www.linkedin.com/in/johndoe"
-
-    def tearDown(self):
-        """Clean up temporary directory."""
-        if os.path.exists(self.test_dir):
-            shutil.rmtree(self.test_dir)
-
-    def test_successful_file_save(self):
-        """Test successful saving of HTML content to file."""
-        # Temporarily change the file path calculation
-        original_dirname = os.path.dirname
-
-        def mock_dirname(path):
-            if path.endswith('context_acquisition.py'):
-                return self.test_dir
-            return original_dirname(path)
-
-        # Replace os.path.dirname temporarily
-        original_func = functions.context_acquisition.os.path.dirname
-        functions.context_acquisition.os.path.dirname = mock_dirname
-
-        try:
-            result = _save_html_to_file(self.test_html, self.test_url)
-
-            # Verify file was created
-            self.assertTrue(os.path.exists(result))
-            self.assertTrue(result.endswith('.html'))
-
-            # Verify file content
-            with open(result, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self.assertEqual(content, self.test_html)
-
-        finally:
-            # Restore original function
-            functions.context_acquisition.os.path.dirname = original_func
-
-
-class TestSetupChromeDriverOptions(unittest.TestCase):
-    """Test cases for the setup_chrome_driver_options function."""
-
-    def test_chrome_options_configuration(self):
-        """Test that Chrome options are properly configured."""
-        options = setup_chrome_driver_options()
-
-        # Verify that options object is returned
-        self.assertIsNotNone(options)
-
-        # Verify it's the correct type
-        self.assertIsInstance(options, Options)
-
-    def test_chrome_options_arguments(self):
-        """Test that required Chrome arguments are set."""
-        options = setup_chrome_driver_options()
-
-        # Access the arguments (this is implementation dependent)
-        # Note: This test verifies the function runs without error
-        # Specific argument verification would require accessing private attributes
-        self.assertIsNotNone(options)
-
-
-class TestURLValidation(unittest.TestCase):
-    """Test cases for URL validation logic (extracted from main function)."""
-
-    def test_valid_linkedin_urls(self):
-        """Test validation of valid LinkedIn URLs."""
-        valid_urls = [
-            "https://www.linkedin.com/in/johndoe",
-            "https://linkedin.com/in/jane-smith",
-            "http://www.linkedin.com/in/test123",
-            "https://www.linkedin.com/in/user-name-with-dashes",
-        ]
-
-        for url in valid_urls:
-            # Test the validation logic directly
-            self.assertTrue(isinstance(url, str))
-            self.assertTrue(url.strip())
-            self.assertIn("linkedin.com/in/", url)
-
-    def test_invalid_linkedin_urls(self):
-        """Test validation of invalid LinkedIn URLs."""
-        invalid_urls = [
-            "",
-            None,
-            "https://www.example.com/profile",
-            "https://www.linkedin.com/company/test",
-            "https://github.com/user",
-            "not-a-url",
-        ]
-
-        for url in invalid_urls:
-            # Test the validation logic directly
-            if url is None or not isinstance(url, str):
-                self.assertTrue(url is None or not isinstance(url, str))
-            elif not url.strip():
-                self.assertFalse(url.strip())
-            else:
-                self.assertNotIn("linkedin.com/in/", url)
-
-
-class TestHTMLContentProcessing(unittest.TestCase):
-    """Test cases for HTML content processing workflows."""
-
-    def test_html_cleaning_workflow(self):
-        """Test the complete HTML cleaning workflow."""
-        raw_html = """<!DOCTYPE html>
-<html>
-
-<head>
-<title>LinkedIn Profile</title>
-
-</head>
-
-<body>
-<div class="profile">
-<h1>John Doe</h1>
-
-<p>Software Engineer</p>
-</div>
-
-</body>
-
-</html>"""
-
-        cleaned = _clean_html_content(raw_html)
-
-        # Verify no empty lines
-        lines = cleaned.split('\n')
-        for line in lines:
-            self.assertTrue(line.strip(), f"Found empty line: '{line}'")
-
-        # Verify content is preserved
-        self.assertIn("John Doe", cleaned)
-        self.assertIn("Software Engineer", cleaned)
-        self.assertIn("LinkedIn Profile", cleaned)
-
-    def test_minimal_html_cleaning(self):
-        """Test cleaning of minimal HTML content."""
-        minimal_html = "<html><body>Content</body></html>"
-        result = _clean_html_content(minimal_html)
-        self.assertEqual(result, minimal_html)
-
-    def test_complex_whitespace_patterns(self):
-        """Test cleaning of complex whitespace patterns."""
-        complex_html = """<div>
-\t\t
-<span>Text</span>
-\t
-
-<p>Paragraph</p>
-\t
-</div>"""
-
-        result = _clean_html_content(complex_html)
-        lines = result.split('\n')
-
-        # Should have no empty lines
-        for line in lines:
-            self.assertTrue(line.strip())
-
-        # Should preserve content
-        self.assertIn("Text", result)
-        self.assertIn("Paragraph", result)
-
-
-if __name__ == '__main__':
-    unittest.main()
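With the Selenium helpers deleted, this commit leaves the test module as a bare docstring. A natural follow-up would be coverage for the new PDF helpers; a minimal sketch (not part of this commit) against _clean_extracted_text:

    import unittest

    from functions.context_acquisition import _clean_extracted_text


    class TestCleanExtractedText(unittest.TestCase):
        """Sketch tests for the new PDF text cleaner."""

        def test_removes_page_number_and_separator_artifacts(self):
            raw = "Experience\n3\n----\nSoftware Engineer"
            cleaned = _clean_extracted_text(raw)
            # Lone page numbers and separator rows are dropped
            self.assertEqual(cleaned, "Experience\nSoftware Engineer")

        def test_normalizes_whitespace(self):
            raw = "Skills:   Python\r\n\r\n\r\nGo"
            cleaned = _clean_extracted_text(raw)
            # Runs of spaces collapse; CR/LF variants become plain newlines
            self.assertEqual(cleaned, "Skills: Python\nGo")


    if __name__ == "__main__":
        unittest.main()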