Spaces:
Configuration error
Configuration error
File size: 5,535 Bytes
f1fa456 36e38f5 b9464fb f70c1ff b9464fb bef6750 36e38f5 f70c1ff 10f94c1 f70c1ff 10f94c1 f70c1ff 36e38f5 bef6750 36e38f5 bef6750 36e38f5 bef6750 36e38f5 f70c1ff 36e38f5 f70c1ff b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 f70c1ff b9464fb 36e38f5 b9464fb f9a80bc b9464fb 36e38f5 b9464fb f70c1ff 36e38f5 f70c1ff b9464fb bef6750 b9464fb f70c1ff b9464fb f9a80bc b9464fb f70c1ff b9464fb f9a80bc b9464fb f70c1ff 36e38f5 f70c1ff 36e38f5 f70c1ff bef6750 36e38f5 bef6750 36e38f5 f70c1ff b9464fb 36e38f5 f70c1ff 36e38f5 b9464fb 36e38f5 f70c1ff 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb 36e38f5 b9464fb f70c1ff 36e38f5 f70c1ff b9464fb f70c1ff b9464fb f70c1ff 36e38f5 f70c1ff bef6750 36e38f5 f70c1ff bef6750 36e38f5 f70c1ff 36e38f5 b9464fb f70c1ff b9464fb f70c1ff b9464fb f70c1ff b9464fb f70c1ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
"""
context_acquisition.py
Functions for acquiring context from various sources including PDF text extraction,
GitHub profiles, and job posting text.
"""
import re
import logging
import io
import json
import unicodedata
from pathlib import Path
from datetime import datetime
import PyPDF2
from functions.helper import clean_text_whitespace
# pylint: disable=broad-exception-caught
def extract_text(pdf_file: str) -> dict:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        dict: Mapping of section name to cleaned section text, e.g.
            {
                "contact_info": "...",
                "summary": "...",
                "skills": "...",
                "experience": "...",
                "education": "...",
                "certifications": "...",
            }
            plus "general" for any text preceding the first recognized header.
            Returns None when the PDF cannot be read or yields no usable text.
    """
    logger = logging.getLogger(f'{__name__}.extract_text')
    try:
        # Read the PDF file from the file path
        with open(pdf_file, 'rb') as file:
            file_content = file.read()
        # Create PDF reader from the file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        num_pages = len(pdf_reader.pages)
        logger.info("Extracting text from %d pages", num_pages)
        # Collect page texts in a list and join once at the end
        # (repeated str += in a loop is quadratic).
        page_texts = []
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                # extract_text() may return None for malformed pages;
                # coerce to "" so the join below cannot fail.
                page_texts.append(page.extract_text() or "")
            except Exception as e:
                logger.warning("Error extracting text from page %d: %s", page_num, str(e))
                continue
        extracted_text = "".join(text + "\n\n" for text in page_texts)
        logger.info("Extracted text length: %d characters", len(extracted_text))
        # Clean and structure the extracted text for LLM consumption
        structured_content = _parse_resume_text(extracted_text)
        if not structured_content:
            return None
        logger.info("Found sections: %s", list(structured_content.keys()))
        # Best-effort persistence of results to a timestamped JSON file;
        # a save failure must not fail the extraction itself.
        try:
            linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile"
            linkedin_profile_dir.mkdir(parents=True, exist_ok=True)
            # Create timestamped filename
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(structured_content, f, indent=2, ensure_ascii=False)
        except Exception as save_error:
            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
        return structured_content
    except Exception as e:
        logger.error("Error processing PDF file: %s", str(e))
        return None
def _parse_resume_text(text: str) -> dict:
    """
    Parse resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        dict: Mapping of section name ("contact_info", "summary", "skills",
            "experience", "education", "certifications", or "general" for
            unclassified leading text) to cleaned section content.
            Returns None for empty input.
    """
    if not text:
        return None
    # Section header patterns (common LinkedIn export sections), compiled
    # once instead of per line. The trailing \b fixes a misclassification
    # bug: re.match is an unanchored-end prefix match, so content lines
    # like "Worked at..." or "Experienced engineer..." used to be taken
    # for section headers, fragmenting their sections. Optional plural
    # suffixes (s?) keep headers such as "Certifications" matching.
    section_patterns = {
        "contact_info": re.compile(r"(?i)(contact|personal|profile)s?\b\s*(?:information)?"),
        "summary": re.compile(r"(?i)(summary|about|overview|profile)s?\b"),
        "skills": re.compile(r"(?i)(skills|expertise|competencies|proficiencies)\b"),
        "experience": re.compile(r"(?i)(experience|work|employment|professional)s?\b"),
        "education": re.compile(r"(?i)(education|academic|university|college|school)s?\b"),
        "certifications": re.compile(r"(?i)(certification|certificate|license)s?\b"),
    }
    sections = {}
    current_section = "general"
    current_content = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # First pattern (in declaration order) that matches wins,
        # preserving the original precedence (e.g. "Profile" ->
        # contact_info, not summary).
        section_found = next(
            (name for name, pattern in section_patterns.items() if pattern.match(line)),
            None,
        )
        if section_found:
            # Save previous section content before starting the new one
            if current_content:
                sections[current_section] = '\n'.join(current_content)
            current_section = section_found
            current_content = [line]
        else:
            current_content.append(line)
    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)
    # Clean each section; building a new dict avoids rebinding values
    # while iterating the original's items.
    return {name: _clean_section(content) for name, content in sections.items()}
def _clean_section(text: str) -> str:
    """
    Clean a section of text by normalizing whitespace and removing unnecessary characters.

    Args:
        text (str): The text section to clean

    Returns:
        str: Cleaned text section
    """
    # NFKC folds compatibility characters (ligatures, fullwidth forms,
    # etc.) into their plain equivalents.
    normalized = unicodedata.normalize('NFKC', text)
    # Drop the "Page n of n" footers that the LinkedIn export inserts.
    without_footers = re.sub(r'Page \d+ of \d+', '', normalized)
    # Collapse redundant whitespace via the shared helper, then trim.
    cleaned = clean_text_whitespace(without_footers)
    return cleaned.strip()
|