Spaces:

gperdrizet
/

resumate

Configuration error

File size: 10,113 Bytes

f1fa456
36e38f5

"""
context_acquisition.py

Functions for acquiring context from various sources including PDF text extraction,
GitHub profiles, and job posting text.
"""

import re
import logging
import io
import os
import PyPDF2

# Set up logging.
# NOTE: basicConfig is called at import time, so importing this module
# requests INFO-level logging configuration on the root logger.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)


def extract_text_from_linkedin_pdf(pdf_file) -> dict:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    The file is read fully into memory, parsed with PyPDF2, and the text of every
    page is concatenated (each page followed by a blank line) before being passed
    to ``_structure_resume_text``. Pages whose extraction raises are logged and
    skipped so that one bad page does not discard the whole document.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        dict: Dictionary containing extraction status, structured text content, and metadata.
        ``status`` is one of:

        * ``"success"`` - text was extracted; ``metadata`` also lists ``sections_found``.
        * ``"warning"`` - the PDF was read but yielded no text.
        * ``"error"``   - no file was given or reading/parsing failed; only
          ``status`` and ``message`` are present.

    Example:
        {
            "status": "success",
            "structured_text": {
                "sections": {...},
                "full_text": "...",
                "llm_formatted": "...",
                "summary": "..."
            },
            "metadata": {...}
        }
    """
    if pdf_file is None:
        return {"status": "error", "message": "No PDF file provided"}

    try:
        # Get filename from path
        filename = os.path.basename(pdf_file)

        # Read the PDF file from the file path
        with open(pdf_file, 'rb') as file:
            file_content = file.read()
        file_size = len(file_content)

        # Create PDF reader from the in-memory file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        num_pages = len(pdf_reader.pages)

        # Collect per-page text in a list and join once instead of growing a
        # string with += on every page.
        page_texts = []
        for page_num in range(num_pages):
            try:
                # Page lookup stays inside the try so a broken page object is
                # skipped just like a failed extraction. "or ''" guards against
                # an empty/None extraction result.
                page_text = pdf_reader.pages[page_num].extract_text() or ""
            except Exception as e:  # best-effort: skip the page, keep the rest
                logger.warning("Error extracting text from page %d: %s", page_num + 1, e)
                continue
            page_texts.append(page_text)

        extracted_text = "".join(f"{page_text}\n\n" for page_text in page_texts)

        # Clean and structure the extracted text for LLM consumption
        structured_content = _structure_resume_text(extracted_text)

        metadata = {
            "filename": filename,
            "file_size": file_size,
            "pages": num_pages,
        }

        if not structured_content["full_text"].strip():
            return {
                "status": "warning",
                "structured_text": structured_content,
                "metadata": metadata,
                "message": "PDF processed but no text content was extracted"
            }

        logger.info(
            "Successfully extracted and structured %d characters from %s",
            len(structured_content["full_text"]),
            filename,
        )

        return {
            "status": "success",
            "structured_text": structured_content,
            "metadata": {
                **metadata,
                "sections_found": list(structured_content["sections"].keys())
            },
            "message": f"Text extracted and structured successfully from {num_pages} pages"
        }

    except Exception as e:
        # Top-level boundary: report the failure to the caller as a status dict
        # rather than raising; logger.exception keeps the traceback in the log.
        logger.exception("Error processing PDF file: %s", e)
        return {
            "status": "error",
            "message": f"Failed to extract text from PDF: {str(e)}"
        }


def _structure_resume_text(text: str) -> dict:
    """
    Structure resume text into logical sections for optimal LLM processing.
    
    Args:
        text (str): Raw extracted text from PDF
        
    Returns:
        dict: Structured text with sections, full text, and summary
    """
    if not text:
        return {
            "sections": {},
            "full_text": "",
            "llm_formatted": "",
            "summary": "",
            "format": "structured_resume",
            "word_count": 0,
            "section_count": 0
        }
    
    # Clean the text first
    cleaned_text = _clean_extracted_text(text)
    
    # Define section patterns (common LinkedIn export sections)
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
        "summary": r"(?i)(summary|about|overview|profile)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
        "skills": r"(?i)(skills|competencies|technologies|technical)",
        "certifications": r"(?i)(certification|certificate|license)",
    }
    
    # Split text into lines for processing
    lines = cleaned_text.split('\n')
    sections = {}
    current_section = "general"
    current_content = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Check if line is a section header
        section_found = None
        for section_name, pattern in section_patterns.items():
            if re.match(pattern, line):
                section_found = section_name
                break
        
        if section_found:
            # Save previous section content
            if current_content:
                sections[current_section] = '\n'.join(current_content)
            
            # Start new section
            current_section = section_found
            current_content = [line]
        else:
            current_content.append(line)
    
    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)
    
    # Create a structured summary for LLM context
    summary_parts = []
    if "contact_info" in sections:
        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
    if "summary" in sections:
        summary_parts.append(f"SUMMARY: {sections['summary']}")
    if "experience" in sections:
        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
    if "education" in sections:
        summary_parts.append(f"EDUCATION: {sections['education']}")
    if "skills" in sections:
        summary_parts.append(f"SKILLS: {sections['skills']}")
    
    # Create LLM-optimized format
    llm_formatted_text = _format_for_llm(sections, cleaned_text)
    
    return {
        "sections": sections,
        "full_text": cleaned_text,
        "llm_formatted": llm_formatted_text,
        "summary": '\n\n'.join(summary_parts),
        "format": "structured_resume",
        "word_count": len(cleaned_text.split()),
        "section_count": len(sections)
    }


def _format_for_llm(sections: dict, full_text: str) -> str:
    """
    Format the resume sections in an optimal way for LLM processing.
    
    Args:
        sections (dict): Structured sections
        full_text (str): Full cleaned text
        
    Returns:
        str: LLM-optimized formatted text
    """
    formatted_parts = ["=== RESUME CONTENT ===\n"]
    
    # Prioritize sections in logical order for LLM
    priority_order = ["summary", "contact_info", "experience", "education", "skills", 
                     "certifications", "projects", "achievements", "languages", "volunteer"]
    
    # Add prioritized sections
    for section_name in priority_order:
        if section_name in sections:
            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
            formatted_parts.append(sections[section_name])
            formatted_parts.append("")  # Empty line between sections
    
    # Add any remaining sections
    for section_name, content in sections.items():
        if section_name not in priority_order and section_name != "general":
            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
            formatted_parts.append(content)
            formatted_parts.append("")
    
    # Add general content if exists
    if "general" in sections:
        formatted_parts.append("[ADDITIONAL INFORMATION]")
        formatted_parts.append(sections["general"])
    
    formatted_parts.append("\n=== END RESUME ===")
    
    return '\n'.join(formatted_parts)


def _clean_extracted_text(text: str) -> str:
    """
    Clean and normalize extracted text from PDF for better LLM processing.
    
    Args:
        text (str): Raw extracted text
        
    Returns:
        str: Cleaned text optimized for LLM consumption
    """
    if not text:
        return ""
    
    # Remove excessive whitespace and normalize line endings
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'\r', '\n', text)
    
    # Split into lines and clean each line
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        # Strip whitespace
        cleaned_line = line.strip()
        
        # Skip empty lines and very short lines (likely artifacts)
        if len(cleaned_line) < 2:
            continue
            
        # Remove common PDF artifacts
        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
        
        if cleaned_line:
            cleaned_lines.append(cleaned_line)
    
    # Join lines and normalize spacing
    cleaned_text = '\n'.join(cleaned_lines)
    
    # Normalize multiple spaces to single spaces
    cleaned_text = re.sub(r' +', ' ', cleaned_text)
    
    # Normalize multiple newlines to maximum of 2
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
    
    return cleaned_text.strip()


def get_llm_context_from_resume(extraction_result: dict) -> str:
    """
    Extract the best formatted text for LLM context from the extraction result.

    Only results whose ``status`` is ``"success"`` produce context; anything
    else (including a missing/empty result) yields an empty string. The
    ``llm_formatted`` text is preferred, with ``full_text`` used as a fallback
    when ``llm_formatted`` is missing or empty.

    Args:
        extraction_result (dict): Result from extract_text_from_linkedin_pdf

    Returns:
        str: Formatted text ready for LLM context, or "" if none is available
    """
    # Guard against None/empty input instead of raising AttributeError.
    if not extraction_result or extraction_result.get("status") != "success":
        return ""

    # "or {}" also covers a present-but-None structured_text value.
    structured_text = extraction_result.get("structured_text") or {}

    # Use "or" rather than dict.get defaults so an empty or None llm_formatted
    # value falls back to full_text, not just a missing key.
    return structured_text.get("llm_formatted") or structured_text.get("full_text") or ""