Spaces:

gperdrizet
/

resumate

Configuration error

App Files Files Community

resumate / functions /linkedin_resume.py

gperdrizet

Cleaned up, added some instruction text to the UI and renamed the PDF resume parsing module.

f5b66ec 3 months ago

raw

history blame

10.1 kB

	"""
	linkedin_resume.py

	Functions for extracting text from a LinkedIn resume export PDF, cleaning it,
	and structuring it into sections for use as LLM context.
	"""

	import re
	import logging
	import io
	import os
	import PyPDF2

	# Module-level logger used by every function in this file.
	logger = logging.getLogger(__name__)
	# Configure root logging at INFO level when the module is imported.
	logging.basicConfig(level=logging.INFO)


	def extract_text_from_linkedin_pdf(pdf_file) -> dict:
	    """
	    Read a LinkedIn resume export PDF from disk and return its text in structured form.

	    The file is read fully into memory, parsed with PyPDF2, and the text of every
	    page is concatenated (pages that fail to extract are logged and skipped). The
	    combined text is then passed to _structure_resume_text.

	    Args:
	        pdf_file: The file path string to the uploaded PDF file

	    Returns:
	        dict: Always contains "status" and "message". On "success" or "warning"
	        it also contains "structured_text" and "metadata"; "warning" means the
	        PDF was parsed but no text was found. On "error" only "status" and
	        "message" are present.

	    Example:
	        {
	            "status": "success",
	            "structured_text": {
	                "sections": {...},
	                "full_text": "...",
	                "llm_formatted": "...",
	                "summary": "..."
	            },
	            "metadata": {...}
	        }
	    """
	    if pdf_file is None:
	        return {"status": "error", "message": "No PDF file provided"}

	    try:
	        filename = os.path.basename(pdf_file)

	        # Load the whole file so its size can be reported alongside the parse result
	        with open(pdf_file, 'rb') as handle:
	            raw_bytes = handle.read()
	            file_size = len(raw_bytes)

	        reader = PyPDF2.PdfReader(io.BytesIO(raw_bytes))
	        num_pages = len(reader.pages)

	        # Collect per-page text; a failing page is logged and skipped, not fatal
	        page_chunks = []
	        for index in range(num_pages):
	            try:
	                current_page = reader.pages[index]
	                page_chunks.append(current_page.extract_text() + "\n\n")
	            except Exception as page_error:
	                logger.warning(f"Error extracting text from page {index + 1}: {str(page_error)}")

	        structured_content = _structure_resume_text("".join(page_chunks))

	        base_metadata = {
	            "filename": filename,
	            "file_size": file_size,
	            "pages": num_pages
	        }

	        if structured_content["full_text"].strip():
	            logger.info(f"Successfully extracted and structured {len(structured_content['full_text'])} characters from {filename}")

	            return {
	                "status": "success",
	                "structured_text": structured_content,
	                "metadata": {
	                    **base_metadata,
	                    "sections_found": list(structured_content["sections"].keys())
	                },
	                "message": f"Text extracted and structured successfully from {num_pages} pages"
	            }

	        # Parsed fine, but nothing usable came out of the pages
	        return {
	            "status": "warning",
	            "structured_text": structured_content,
	            "metadata": base_metadata,
	            "message": "PDF processed but no text content was extracted"
	        }

	    except Exception as error:
	        logger.error(f"Error processing PDF file: {str(error)}")
	        return {
	            "status": "error",
	            "message": f"Failed to extract text from PDF: {str(error)}"
	        }


	def _structure_resume_text(text: str) -> dict:
	    """
	    Structure resume text into logical sections for optimal LLM processing.

	    The text is cleaned with _clean_extracted_text and scanned line by line.
	    A line that begins (case-insensitively) with one of the known section
	    keywords opens that section; every other line is added to the section
	    currently open. Lines seen before the first header go to "general". If
	    the same section is opened more than once, its content is accumulated
	    rather than replaced.

	    Args:
	        text (str): Raw extracted text from PDF

	    Returns:
	        dict: Keys "sections" (name -> text), "full_text", "llm_formatted",
	        "summary", "format", "word_count" and "section_count". When the input
	        is empty, or contains nothing after cleaning, all text fields are
	        empty and both counts are 0.
	    """
	    empty_result = {
	        "sections": {},
	        "full_text": "",
	        "llm_formatted": "",
	        "summary": "",
	        "format": "structured_resume",
	        "word_count": 0,
	        "section_count": 0
	    }

	    if not text:
	        return empty_result

	    # Clean the text first
	    cleaned_text = _clean_extracted_text(text)

	    # Whitespace-only input cleans down to nothing; report it as empty rather
	    # than wrapping an empty resume in the LLM banner lines.
	    if not cleaned_text:
	        return empty_result

	    # Section header patterns (common LinkedIn export sections). The "|" must be
	    # unescaped to act as alternation; an escaped "\|" would only match a literal
	    # pipe character. Order matters: the first pattern that matches wins.
	    section_patterns = {
	        "contact_info": re.compile(r"(?i)(contact|personal|profile)\s*(?:information)?"),
	        "summary": re.compile(r"(?i)(summary|about|overview|profile)"),
	        "experience": re.compile(r"(?i)(experience|work|employment|professional)"),
	        "education": re.compile(r"(?i)(education|academic|university|college|school)"),
	        "skills": re.compile(r"(?i)(skills|competencies|technologies|technical)"),
	        "certifications": re.compile(r"(?i)(certification|certificate|license)"),
	    }

	    # Collect lines per section; a list per section lets repeated headers
	    # extend the section instead of overwriting what was gathered earlier.
	    section_lines = {}
	    current_section = "general"

	    for line in cleaned_text.split('\n'):
	        line = line.strip()
	        if not line:
	            continue

	        # A line starting with a section keyword switches the current section
	        for section_name, pattern in section_patterns.items():
	            if pattern.match(line):
	                current_section = section_name
	                break

	        # The header line itself is kept as part of the section content
	        section_lines.setdefault(current_section, []).append(line)

	    sections = {
	        name: '\n'.join(content) for name, content in section_lines.items()
	    }

	    # Create a structured summary for LLM context
	    summary_parts = []
	    if "contact_info" in sections:
	        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
	    if "summary" in sections:
	        summary_parts.append(f"SUMMARY: {sections['summary']}")
	    if "experience" in sections:
	        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
	    if "education" in sections:
	        summary_parts.append(f"EDUCATION: {sections['education']}")
	    if "skills" in sections:
	        summary_parts.append(f"SKILLS: {sections['skills']}")

	    # Create LLM-optimized format
	    llm_formatted_text = _format_for_llm(sections, cleaned_text)

	    return {
	        "sections": sections,
	        "full_text": cleaned_text,
	        "llm_formatted": llm_formatted_text,
	        "summary": '\n\n'.join(summary_parts),
	        "format": "structured_resume",
	        "word_count": len(cleaned_text.split()),
	        "section_count": len(sections)
	    }


	def _format_for_llm(sections: dict, full_text: str) -> str:
	    """
	    Render resume sections as a single bracket-labelled text block.

	    Sections listed in the preferred order come first, then any other
	    sections in the order they appear in the dict, and finally the "general"
	    section under the label ADDITIONAL INFORMATION. The whole block is framed
	    by RESUME CONTENT / END RESUME banner lines.

	    Args:
	        sections (dict): Mapping of section name to section text
	        full_text (str): Full cleaned text (accepted but not used here)

	    Returns:
	        str: The formatted text
	    """
	    preferred = ["summary", "contact_info", "experience", "education", "skills",
	                 "certifications", "projects", "achievements", "languages", "volunteer"]

	    def render(name, body):
	        # Header label, section body, then a blank separator line
	        return [f"[{name.upper().replace('_', ' ')}]", body, ""]

	    known = [name for name in preferred if name in sections]
	    extras = [name for name in sections
	              if name not in preferred and name != "general"]

	    pieces = ["=== RESUME CONTENT ===\n"]
	    for name in known + extras:
	        pieces.extend(render(name, sections[name]))

	    # Unlabelled leading content goes last, without a trailing blank line
	    if "general" in sections:
	        pieces.extend(["[ADDITIONAL INFORMATION]", sections["general"]])

	    pieces.append("\n=== END RESUME ===")

	    return '\n'.join(pieces)


	def _clean_extracted_text(text: str) -> str:
	    """
	    Normalize raw PDF text into a compact, line-oriented form.

	    Carriage returns are converted to newlines, every line is stripped, and
	    lines are dropped when they are shorter than two characters, consist only
	    of digits, or consist only of the separator characters | - _ =. Runs of
	    spaces in the result are collapsed to a single space.

	    Args:
	        text (str): Raw extracted text

	    Returns:
	        str: Cleaned text, or "" when the input is empty or None
	    """
	    if not text:
	        return ""

	    # Lines matching any of these in full are treated as PDF artifacts
	    artifact_patterns = (
	        r'^\d+$',          # Page numbers
	        r'^[\|\-_=]+$',    # Separator lines
	    )

	    # Unify Windows and old-Mac line endings before splitting
	    unified = re.sub(r'\r', '\n', re.sub(r'\r\n', '\n', text))

	    kept = []
	    for raw_line in unified.split('\n'):
	        candidate = raw_line.strip()

	        # Empty or single-character lines are likely extraction noise
	        if len(candidate) < 2:
	            continue

	        if any(re.search(pattern, candidate) for pattern in artifact_patterns):
	            continue

	        kept.append(candidate)

	    collapsed = re.sub(r' +', ' ', '\n'.join(kept))

	    # Cap consecutive newlines at two
	    collapsed = re.sub(r'\n{3,}', '\n\n', collapsed)

	    return collapsed.strip()


	def get_llm_context_from_resume(extraction_result: dict) -> str:
	    """
	    Pick the text to hand to an LLM out of an extraction result.

	    Args:
	        extraction_result (dict): Result from extract_text_from_linkedin_pdf

	    Returns:
	        str: The "llm_formatted" text when that key is present, otherwise the
	        "full_text" value, otherwise "". Always "" when the result's status
	        is anything other than "success".
	    """
	    succeeded = extraction_result.get("status") == "success"
	    if not succeeded:
	        return ""

	    payload = extraction_result.get("structured_text", {})

	    # full_text is only used when the llm_formatted key is absent altogether
	    fallback = payload.get("full_text", "")
	    return payload.get("llm_formatted", fallback)