""" | |
context_acquisition.py | |
Functions for acquiring context from various sources including PDF text extraction, | |
GitHub profiles, and job posting text. | |
""" | |
import re | |
import logging | |
import io | |
import os | |
import json | |
from pathlib import Path | |
from datetime import datetime | |
import PyPDF2 | |
# pylint: disable=broad-exception-caught | |
# Set up logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |


def check_default_linkedin_pdf():
    """Check if the default LinkedIn PDF exists in the data directory."""
    # Get the project root directory (parent of the functions directory)
    project_root = Path(__file__).parent.parent
    default_pdf = project_root / "data" / "linkedin_profile.pdf"
    if not default_pdf.exists():
        logger.warning("Default LinkedIn PDF not found at %s", default_pdf)
        return False, None
    return True, str(default_pdf)
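
# Usage sketch (hypothetical caller, not part of the module's API):
#   exists, pdf_path = check_default_linkedin_pdf()
#   if exists:
#       extraction = extract_text_from_linkedin_pdf(pdf_path)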


def extract_text_from_linkedin_pdf(pdf_file: str) -> dict:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        dict: Dictionary containing extraction status, structured text content, and metadata

    Example:
        {
            "status": "success",
            "structured_text": {
                "sections": {...},
                "full_text": "...",
                "llm_formatted": "...",
                "summary": "..."
            },
            "metadata": {...}
        }
    """
    if pdf_file is None:
        return {"status": "error", "message": "No PDF file provided"}

    try:
        # Get filename from path
        filename = os.path.basename(pdf_file)

        # Read the PDF file from the file path
        with open(pdf_file, 'rb') as file:
            file_content = file.read()
        file_size = len(file_content)

        # Create PDF reader from the file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))

        # Extract text from all pages
        extracted_text = ""
        num_pages = len(pdf_reader.pages)
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
                extracted_text += page_text + "\n\n"
            except Exception as e:
                logger.warning("Error extracting text from page %d: %s", page_num, str(e))
                continue

        # Clean and structure the extracted text for LLM consumption
        structured_content = _structure_resume_text(extracted_text)

        if not structured_content["full_text"].strip():
            return {
                "status": "warning",
                "structured_text": structured_content,
                "metadata": {
                    "filename": filename,
                    "file_size": file_size,
                    "pages": num_pages
                },
                "message": "PDF processed but no text content was extracted"
            }

        logger.info(
            "Successfully extracted and structured %d characters from %s",
            len(structured_content['full_text']),
            filename
        )

        result = {
            "status": "success",
            "structured_text": structured_content,
            "metadata": {
                "filename": filename,
                "file_size": file_size,
                "pages": num_pages,
                "sections_found": list(structured_content["sections"].keys())
            },
            "message": f"Text extracted and structured successfully from {num_pages} pages"
        }

        # Save results to a timestamped JSON file
        try:
            linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile"
            linkedin_profile_dir.mkdir(parents=True, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            logger.info("LinkedIn resume extraction saved to %s", output_file)
        except Exception as save_error:
            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))

        return result

    except Exception as e:
        logger.error("Error processing PDF file: %s", str(e))
        return {
            "status": "error",
            "message": f"Failed to extract text from PDF: {str(e)}"
        }


def _structure_resume_text(text: str) -> dict:
    """
    Structure resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        dict: Structured text with sections, full text, and summary
    """
    if not text:
        return {
            "sections": {},
            "full_text": "",
            "llm_formatted": "",
            "summary": "",
            "format": "structured_resume",
            "word_count": 0,
            "section_count": 0
        }

    # Clean the text first
    cleaned_text = _clean_extracted_text(text)

    # Define section patterns (common LinkedIn export sections)
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
        "summary": r"(?i)(summary|about|overview|profile)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
        "skills": r"(?i)(skills|competencies|technologies|technical)",
        "certifications": r"(?i)(certification|certificate|license)",
    }
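
    # Note: headers are matched with re.match, so "Work Experience" hits the
    # "experience" pattern via its leading "Work". The patterns overlap
    # ("profile" appears under both contact_info and summary); the first match
    # in dict insertion order wins.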

    # Split text into lines for processing
    lines = cleaned_text.split('\n')
    sections = {}
    current_section = "general"
    current_content = []

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Check if the line is a section header; real headers are short, so
        # skip long lines that merely start with a keyword (e.g. a sentence
        # beginning "Experienced engineer...")
        section_found = None
        if len(line) <= 40:
            for section_name, pattern in section_patterns.items():
                if re.match(pattern, line):
                    section_found = section_name
                    break

        if section_found:
            # Save previous section content
            if current_content:
                sections[current_section] = '\n'.join(current_content)
            # Start new section
            current_section = section_found
            current_content = [line]
        else:
            current_content.append(line)

    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)

    # Create a structured summary for LLM context
    summary_parts = []
    if "contact_info" in sections:
        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
    if "summary" in sections:
        summary_parts.append(f"SUMMARY: {sections['summary']}")
    if "experience" in sections:
        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
    if "education" in sections:
        summary_parts.append(f"EDUCATION: {sections['education']}")
    if "skills" in sections:
        summary_parts.append(f"SKILLS: {sections['skills']}")

    # Create LLM-optimized format
    llm_formatted_text = _format_for_llm(sections)

    return {
        "sections": sections,
        "full_text": cleaned_text,
        "llm_formatted": llm_formatted_text,
        "summary": '\n\n'.join(summary_parts),
        "format": "structured_resume",
        "word_count": len(cleaned_text.split()),
        "section_count": len(sections)
    }


def _format_for_llm(sections: dict) -> str:
    """
    Format the resume sections in an optimal way for LLM processing.

    Args:
        sections (dict): Structured sections

    Returns:
        str: LLM-optimized formatted text
    """
    formatted_parts = ["=== RESUME CONTENT ===\n"]

    # Prioritize sections in logical order for LLM
    priority_order = ["summary", "contact_info", "experience", "education", "skills",
                      "certifications", "projects", "achievements", "languages", "volunteer"]

    # Add prioritized sections
    for section_name in priority_order:
        if section_name in sections:
            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
            formatted_parts.append(sections[section_name])
            formatted_parts.append("")  # Empty line between sections

    # Add any remaining sections
    for section_name, content in sections.items():
        if section_name not in priority_order and section_name != "general":
            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
            formatted_parts.append(content)
            formatted_parts.append("")

    # Add general content if it exists
    if "general" in sections:
        formatted_parts.append("[ADDITIONAL INFORMATION]")
        formatted_parts.append(sections["general"])

    formatted_parts.append("\n=== END RESUME ===")
    return '\n'.join(formatted_parts)
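
# For sections like {"summary": "...", "skills": "..."} the function above
# produces output of the shape:
#
#   === RESUME CONTENT ===
#   [SUMMARY]
#   ...
#   [SKILLS]
#   ...
#   === END RESUME ===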


def _clean_extracted_text(text: str) -> str:
    """
    Clean and normalize extracted text from PDF for better LLM processing.

    Args:
        text (str): Raw extracted text

    Returns:
        str: Cleaned text optimized for LLM consumption
    """
    if not text:
        return ""

    # Normalize line endings to '\n'
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'\r', '\n', text)

    # Split into lines and clean each line
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        # Strip whitespace
        cleaned_line = line.strip()
        # Skip empty lines and very short lines (likely artifacts)
        if len(cleaned_line) < 2:
            continue
        # Remove common PDF artifacts
        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
        if cleaned_line:
            cleaned_lines.append(cleaned_line)

    # Join lines and normalize spacing
    cleaned_text = '\n'.join(cleaned_lines)
    # Collapse runs of spaces to a single space
    cleaned_text = re.sub(r' +', ' ', cleaned_text)
    # Allow at most two consecutive newlines
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

    return cleaned_text.strip()
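
# A quick sanity check of the cleaning rules above (illustrative sketch):
#   _clean_extracted_text("1\r\n\r\nJohn  Doe\n---\n")  ->  "John Doe"
# The lone "1" is dropped as a short line, "---" as a separator line, and the
# double space inside the name is collapsed.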


def get_llm_context_from_resume(extraction_result: dict) -> str:
    """
    Extract the best formatted text for LLM context from the extraction result.

    Args:
        extraction_result (dict): Result from extract_text_from_linkedin_pdf

    Returns:
        str: Formatted text ready for LLM context
    """
    if extraction_result.get("status") != "success":
        return ""

    structured_text = extraction_result.get("structured_text", {})
    # Return the LLM-formatted version if available, otherwise fall back to full text
    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
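

if __name__ == "__main__":
    # Minimal end-to-end sketch of the module's pipeline. Assumes a LinkedIn
    # export PDF is present at the default data/linkedin_profile.pdf location;
    # swap in any local PDF path to try it out.
    found, path = check_default_linkedin_pdf()
    if found:
        extraction = extract_text_from_linkedin_pdf(path)
        print(extraction["status"], "-", extraction.get("message", ""))
        context = get_llm_context_from_resume(extraction)
        print(context[:500])  # preview the LLM-ready context
    else:
        print("Place a LinkedIn PDF export at data/linkedin_profile.pdf to try this module.")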