resumate / functions /linkedin_resume.py
gperdrizet's picture
Cleaned up, added some instruction text to the UI and renamed the PDF resume parsing module.
f5b66ec
raw
history blame
10.1 kB
"""
linkedin_resume.py
Functions for extracting text from an uploaded LinkedIn resume export PDF, splitting it
into resume sections, and formatting the result for use as LLM context.
"""
import re
import logging
import io
import os
import PyPDF2
# Set up logging: configure the root logger at INFO level when this module is
# imported, then create the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_text_from_linkedin_pdf(pdf_file) -> dict:
    """
    Read a LinkedIn resume export PDF from disk and return its text, structured
    for LLM processing.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        dict: Always contains "status" and "message". On "success" and "warning"
        it also contains "structured_text" (the output of _structure_resume_text)
        and "metadata" (filename, file_size, pages, plus sections_found on success).
        "warning" means the PDF was read but produced no text; "error" means no
        file was given or reading/parsing failed.

    Example:
        {
            "status": "success",
            "structured_text": {
                "sections": {...},
                "full_text": "...",
                "llm_formatted": "...",
                "summary": "..."
            },
            "metadata": {...}
        }
    """
    if pdf_file is None:
        return {"status": "error", "message": "No PDF file provided"}

    try:
        filename = os.path.basename(pdf_file)

        # Load the whole file once so its size can be reported alongside the text
        with open(pdf_file, 'rb') as handle:
            raw_bytes = handle.read()
        file_size = len(raw_bytes)

        reader = PyPDF2.PdfReader(io.BytesIO(raw_bytes))
        num_pages = len(reader.pages)

        # Collect text page by page; a page that fails is logged and skipped so
        # the remaining pages still contribute
        page_chunks = []
        for index in range(num_pages):
            try:
                page_chunks.append(reader.pages[index].extract_text() + "\n\n")
            except Exception as exc:
                logger.warning(f"Error extracting text from page {index + 1}: {str(exc)}")

        structured_content = _structure_resume_text("".join(page_chunks))

        metadata = {
            "filename": filename,
            "file_size": file_size,
            "pages": num_pages
        }

        # Nothing usable came out of the PDF: report a warning rather than success
        if not structured_content["full_text"].strip():
            return {
                "status": "warning",
                "structured_text": structured_content,
                "metadata": metadata,
                "message": "PDF processed but no text content was extracted"
            }

        logger.info(f"Successfully extracted and structured {len(structured_content['full_text'])} characters from {filename}")

        metadata["sections_found"] = list(structured_content["sections"].keys())
        return {
            "status": "success",
            "structured_text": structured_content,
            "metadata": metadata,
            "message": f"Text extracted and structured successfully from {num_pages} pages"
        }

    except Exception as exc:
        # Any failure (missing file, unreadable PDF, ...) is reported as an error
        # result instead of being raised to the caller
        logger.error(f"Error processing PDF file: {str(exc)}")
        return {
            "status": "error",
            "message": f"Failed to extract text from PDF: {str(exc)}"
        }
def _structure_resume_text(text: str) -> dict:
    """
    Structure resume text into logical sections for optimal LLM processing.

    The text is cleaned with _clean_extracted_text, then walked line by line.
    A line whose beginning matches one of the section patterns switches the
    current section (the matching line itself is kept as part of that section).
    Lines seen before any header go into a "general" section.

    Header detection is a case-insensitive prefix match, so an ordinary body
    line such as "Work on ..." can also switch sections. Because of that, the
    same section may be entered several times; its lines are accumulated in
    order of appearance rather than overwritten, so no text is lost.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        dict: Structured text with keys "sections", "full_text", "llm_formatted",
        "summary", "format", "word_count" and "section_count"
    """
    if not text:
        return {
            "sections": {},
            "full_text": "",
            "llm_formatted": "",
            "summary": "",
            "format": "structured_resume",
            "word_count": 0,
            "section_count": 0
        }

    # Clean the text first
    cleaned_text = _clean_extracted_text(text)

    # Define section patterns (common LinkedIn export sections).
    # Order matters: the first pattern that matches a line wins.
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
        "summary": r"(?i)(summary|about|overview|profile)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
        "skills": r"(?i)(skills|competencies|technologies|technical)",
        "certifications": r"(?i)(certification|certificate|license)",
    }
    # Compile once instead of re-resolving every pattern for every line
    compiled_patterns = [
        (section_name, re.compile(pattern))
        for section_name, pattern in section_patterns.items()
    ]

    # Lines collected per section; a list per key lets a section that is
    # entered more than once keep all of its content
    section_lines = {}
    current_section = "general"

    for line in cleaned_text.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Check if line is a section header
        section_found = next(
            (name for name, pattern in compiled_patterns if pattern.match(line)),
            None,
        )
        if section_found is not None:
            current_section = section_found

        section_lines.setdefault(current_section, []).append(line)

    sections = {name: '\n'.join(content) for name, content in section_lines.items()}

    # Create a structured summary for LLM context: (section key, label, max chars).
    # Truncated entries are followed by "..."
    summary_layout = (
        ("contact_info", "CONTACT", 200),
        ("summary", "SUMMARY", None),
        ("experience", "EXPERIENCE", 300),
        ("education", "EDUCATION", None),
        ("skills", "SKILLS", None),
    )
    summary_parts = []
    for key, label, limit in summary_layout:
        if key not in sections:
            continue
        body = sections[key]
        if limit is not None:
            body = f"{body[:limit]}..."
        summary_parts.append(f"{label}: {body}")

    # Create LLM-optimized format
    llm_formatted_text = _format_for_llm(sections, cleaned_text)

    return {
        "sections": sections,
        "full_text": cleaned_text,
        "llm_formatted": llm_formatted_text,
        "summary": '\n\n'.join(summary_parts),
        "format": "structured_resume",
        "word_count": len(cleaned_text.split()),
        "section_count": len(sections)
    }
def _format_for_llm(sections: dict, full_text: str) -> str:
    """
    Render resume sections as a single delimited text block for an LLM prompt.

    Sections are written as "[SECTION NAME]" followed by their content and a
    blank line. Known sections come first in a fixed priority order, then any
    other sections in their dict order, and finally the "general" section under
    the heading "[ADDITIONAL INFORMATION]".

    Args:
        sections (dict): Structured sections (section name -> text)
        full_text (str): Full cleaned text (accepted for interface
            compatibility; not used when building the output)

    Returns:
        str: LLM-optimized formatted text
    """
    preferred = ("summary", "contact_info", "experience", "education", "skills",
                 "certifications", "projects", "achievements", "languages", "volunteer")

    def heading(key):
        # "contact_info" -> "[CONTACT INFO]"
        return f"[{key.upper().replace('_', ' ')}]"

    # Prioritized sections first, then whatever else is present except "general"
    ordered_keys = [key for key in preferred if key in sections]
    ordered_keys += [
        key for key in sections
        if key not in preferred and key != "general"
    ]

    out = ["=== RESUME CONTENT ===\n"]
    for key in ordered_keys:
        # Trailing empty string yields a blank line between sections
        out.extend((heading(key), sections[key], ""))

    # Unclassified content goes last
    if "general" in sections:
        out.extend(("[ADDITIONAL INFORMATION]", sections["general"]))

    out.append("\n=== END RESUME ===")
    return '\n'.join(out)
def _clean_extracted_text(text: str) -> str:
    """
    Normalize raw PDF text: unify line endings, drop noise lines, collapse spaces.

    A line is dropped when, after stripping, it is shorter than two characters,
    consists only of digits (treated as a page number), or consists only of the
    separator characters | - _ =.

    Args:
        text (str): Raw extracted text

    Returns:
        str: Cleaned text optimized for LLM consumption ("" for empty input)
    """
    if not text:
        return ""

    # Convert Windows and old-Mac line endings to "\n"
    normalized = text.replace('\r\n', '\n').replace('\r', '\n')

    kept = []
    for raw_line in normalized.split('\n'):
        candidate = raw_line.strip()

        # Empty and one-character lines are likely extraction artifacts
        if len(candidate) < 2:
            continue

        # Page numbers and separator rules carry no resume content
        if re.match(r'^\d+$', candidate) or re.match(r'^[|\-_=]+$', candidate):
            continue

        kept.append(candidate)

    joined = '\n'.join(kept)

    # Normalize multiple spaces to single spaces
    joined = re.sub(r' +', ' ', joined)

    # Normalize multiple newlines to maximum of 2
    joined = re.sub(r'\n{3,}', '\n\n', joined)

    return joined.strip()
def get_llm_context_from_resume(extraction_result: dict) -> str:
    """
    Extract the best formatted text for LLM context from the extraction result.

    Only results whose "status" is "success" yield text. The "llm_formatted"
    entry is preferred; if it is missing or empty, "full_text" is used instead.
    A missing/None result or a missing/None "structured_text" entry yields ""
    rather than raising, so the function always returns a string.

    Args:
        extraction_result (dict): Result from extract_text_from_linkedin_pdf

    Returns:
        str: Formatted text ready for LLM context, or "" when none is available
    """
    # No result at all (e.g. None) means there is no context to offer
    if not extraction_result:
        return ""

    if extraction_result.get("status") != "success":
        return ""

    # Guard against the key being present but set to None
    structured_text = extraction_result.get("structured_text") or {}

    # Return the LLM-formatted version if available, otherwise fall back to full text
    return structured_text.get("llm_formatted") or structured_text.get("full_text") or ""