""" context_acquisition.py Functions for acquiring context from various sources including PDF text extraction, GitHub profiles, and job posting text. """ import re import logging import io import os import PyPDF2 # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def extract_text_from_linkedin_pdf(pdf_file) -> dict: """ Extract and structure text content from an uploaded LinkedIn resume export PDF file for optimal LLM processing. Args: pdf_file: The file path string to the uploaded PDF file Returns: dict: Dictionary containing extraction status, structured text content, and metadata Example: { "status": "success", "structured_text": { "sections": {...}, "full_text": "...", "llm_formatted": "...", "summary": "..." }, "metadata": {...} } """ if pdf_file is None: return {"status": "error", "message": "No PDF file provided"} try: # Get filename from path filename = os.path.basename(pdf_file) # Read the PDF file from the file path with open(pdf_file, 'rb') as file: file_content = file.read() file_size = len(file_content) # Create PDF reader from the file content pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content)) # Extract text from all pages extracted_text = "" num_pages = len(pdf_reader.pages) for page_num in range(num_pages): try: page = pdf_reader.pages[page_num] page_text = page.extract_text() extracted_text += page_text + "\n\n" except Exception as e: logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}") continue # Clean and structure the extracted text for LLM consumption structured_content = _structure_resume_text(extracted_text) if not structured_content["full_text"].strip(): return { "status": "warning", "structured_text": structured_content, "metadata": { "filename": filename, "file_size": file_size, "pages": num_pages }, "message": "PDF processed but no text content was extracted" } logger.info(f"Successfully extracted and structured {len(structured_content['full_text'])} characters from {filename}") return { "status": "success", "structured_text": structured_content, "metadata": { "filename": filename, "file_size": file_size, "pages": num_pages, "sections_found": list(structured_content["sections"].keys()) }, "message": f"Text extracted and structured successfully from {num_pages} pages" } except Exception as e: logger.error(f"Error processing PDF file: {str(e)}") return { "status": "error", "message": f"Failed to extract text from PDF: {str(e)}" } def _structure_resume_text(text: str) -> dict: """ Structure resume text into logical sections for optimal LLM processing. Args: text (str): Raw extracted text from PDF Returns: dict: Structured text with sections, full text, and summary """ if not text: return { "sections": {}, "full_text": "", "llm_formatted": "", "summary": "", "format": "structured_resume", "word_count": 0, "section_count": 0 } # Clean the text first cleaned_text = _clean_extracted_text(text) # Define section patterns (common LinkedIn export sections) section_patterns = { "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?", "summary": r"(?i)(summary|about|overview|profile)", "experience": r"(?i)(experience|work|employment|professional)", "education": r"(?i)(education|academic|university|college|school)", "skills": r"(?i)(skills|competencies|technologies|technical)", "certifications": r"(?i)(certification|certificate|license)", } # Split text into lines for processing lines = cleaned_text.split('\n') sections = {} current_section = "general" current_content = [] for line in lines: line = line.strip() if not line: continue # Check if line is a section header section_found = None for section_name, pattern in section_patterns.items(): if re.match(pattern, line): section_found = section_name break if section_found: # Save previous section content if current_content: sections[current_section] = '\n'.join(current_content) # Start new section current_section = section_found current_content = [line] else: current_content.append(line) # Save the last section if current_content: sections[current_section] = '\n'.join(current_content) # Create a structured summary for LLM context summary_parts = [] if "contact_info" in sections: summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...") if "summary" in sections: summary_parts.append(f"SUMMARY: {sections['summary']}") if "experience" in sections: summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...") if "education" in sections: summary_parts.append(f"EDUCATION: {sections['education']}") if "skills" in sections: summary_parts.append(f"SKILLS: {sections['skills']}") # Create LLM-optimized format llm_formatted_text = _format_for_llm(sections, cleaned_text) return { "sections": sections, "full_text": cleaned_text, "llm_formatted": llm_formatted_text, "summary": '\n\n'.join(summary_parts), "format": "structured_resume", "word_count": len(cleaned_text.split()), "section_count": len(sections) } def _format_for_llm(sections: dict, full_text: str) -> str: """ Format the resume sections in an optimal way for LLM processing. Args: sections (dict): Structured sections full_text (str): Full cleaned text Returns: str: LLM-optimized formatted text """ formatted_parts = ["=== RESUME CONTENT ===\n"] # Prioritize sections in logical order for LLM priority_order = ["summary", "contact_info", "experience", "education", "skills", "certifications", "projects", "achievements", "languages", "volunteer"] # Add prioritized sections for section_name in priority_order: if section_name in sections: formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]") formatted_parts.append(sections[section_name]) formatted_parts.append("") # Empty line between sections # Add any remaining sections for section_name, content in sections.items(): if section_name not in priority_order and section_name != "general": formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]") formatted_parts.append(content) formatted_parts.append("") # Add general content if exists if "general" in sections: formatted_parts.append("[ADDITIONAL INFORMATION]") formatted_parts.append(sections["general"]) formatted_parts.append("\n=== END RESUME ===") return '\n'.join(formatted_parts) def _clean_extracted_text(text: str) -> str: """ Clean and normalize extracted text from PDF for better LLM processing. Args: text (str): Raw extracted text Returns: str: Cleaned text optimized for LLM consumption """ if not text: return "" # Remove excessive whitespace and normalize line endings text = re.sub(r'\r\n', '\n', text) text = re.sub(r'\r', '\n', text) # Split into lines and clean each line lines = text.split('\n') cleaned_lines = [] for line in lines: # Strip whitespace cleaned_line = line.strip() # Skip empty lines and very short lines (likely artifacts) if len(cleaned_line) < 2: continue # Remove common PDF artifacts cleaned_line = re.sub(r'^\d+$', '', cleaned_line) # Page numbers cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line) # Separator lines if cleaned_line: cleaned_lines.append(cleaned_line) # Join lines and normalize spacing cleaned_text = '\n'.join(cleaned_lines) # Normalize multiple spaces to single spaces cleaned_text = re.sub(r' +', ' ', cleaned_text) # Normalize multiple newlines to maximum of 2 cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text) return cleaned_text.strip() def get_llm_context_from_resume(extraction_result: dict) -> str: """ Extract the best formatted text for LLM context from the extraction result. Args: extraction_result (dict): Result from extract_text_from_linkedin_pdf Returns: str: Formatted text ready for LLM context """ if extraction_result.get("status") != "success": return "" structured_text = extraction_result.get("structured_text", {}) # Return the LLM-formatted version if available, otherwise fall back to full text return structured_text.get("llm_formatted", structured_text.get("full_text", ""))