Cleaned up LinkedIn resume PDF text extraction and parsing
- functions/gradio.py +53 -60
- functions/job_call.py +1 -1
- functions/linkedin_resume.py +43 -217
functions/gradio.py
CHANGED
@@ -7,8 +7,8 @@ Functions for handling Gradio UI interactions and processing user inputs.
 import logging
 from pathlib import Path
 from functions.helper import clean_text_whitespace
-from functions.linkedin_resume import …
-from functions.github import get_github_repositories
+from functions.linkedin_resume import extract_text
+# from functions.github import get_github_repositories
 # from functions.job_call import summarize_job_call
 # from functions.writer_agent import write_resume
 
@@ -60,33 +60,26 @@ def process_inputs(
     logger.info("User instructions: %s", user_instructions[:100] if user_instructions else "None")
     result = ""
 
-    # …
+    # Extract and structure text from the linkedin profile PDF
+    logger.info("Extracting text from LinkedIn PDF: %s", linkedin_pdf_path)
+    extraction_result = extract_text(linkedin_pdf_path)
 
-    # file_path = linkedin_pdf.name
-    # file_display_name = Path(file_path).name
+    if extraction_result:
+        logger.info("LinkedIn PDF text extraction successful")
 
-    # …
-    # …
+    else:
+        logger.error("LinkedIn PDF text extraction failed")
 
-    # …
-    # …
-    # …
+    # if extraction_result["status"] == "success":
+    #     result += " ✅ Text extraction successful\n\n"
+    #     logger.info("LinkedIn PDF text extraction successful")
 
-    # elif extraction_result["status"] == "warning":
-    #     result += f" ⚠️ Text extraction: {extraction_result['message']}\n\n"
-    #     logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
-    # else:
-    #     result += f" ❌ Text extraction failed: {extraction_result['message']}\n\n"
-    #     logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
+    # elif extraction_result["status"] == "warning":
+    #     result += f" ⚠️ Text extraction: {extraction_result['message']}\n\n"
+    #     logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
     # else:
-    #     result += "❌ …
-    #     logger.…
+    #     result += f" ❌ Text extraction failed: {extraction_result['message']}\n\n"
+    #     logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
 
     # # Process GitHub profile
     # if github_url and github_url.strip():
@@ -153,50 +146,50 @@ def process_inputs(
     return result
 
 
-def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
-    """
-    Get structured data from all inputs for further processing.
+# def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
+#     """
+#     Get structured data from all inputs for further processing.
 
-    Args:
-        linkedin_pdf: Uploaded LinkedIn resume export PDF file
-        github_url (str): GitHub profile URL
-        job_post_text (str): Job post text content
-        instructions (str): Additional instructions from the user
+#     Args:
+#         linkedin_pdf: Uploaded LinkedIn resume export PDF file
+#         github_url (str): GitHub profile URL
+#         job_post_text (str): Job post text content
+#         instructions (str): Additional instructions from the user
 
-    Returns:
-        dict: Structured data containing all processed information
-    """
+#     Returns:
+#         dict: Structured data containing all processed information
+#     """
 
-    job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
-    instructions = instructions.strip() if instructions and instructions.strip() else None
+#     job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
+#     instructions = instructions.strip() if instructions and instructions.strip() else None
 
-    processed_data = {
-        "linkedin": None,
-        "github": None,
-        "job_post": job_post_text,
-        "user_instructions": instructions,
-        "errors": []
-    }
+#     processed_data = {
+#         "linkedin": None,
+#         "github": None,
+#         "job_post": job_post_text,
+#         "user_instructions": instructions,
+#         "errors": []
+#     }
 
-    # Process LinkedIn PDF
-    if linkedin_pdf is not None:
-        file_path = linkedin_pdf.name
-        extraction_result = extract_text_from_linkedin_pdf(file_path)
+#     # Process LinkedIn PDF
+#     if linkedin_pdf is not None:
+#         file_path = linkedin_pdf.name
+#         extraction_result = extract_text_from_linkedin_pdf(file_path)
 
-        if extraction_result["status"] == "success":
-            processed_data["linkedin"] = extraction_result
+#         if extraction_result["status"] == "success":
+#             processed_data["linkedin"] = extraction_result
 
-        else:
-            processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
+#         else:
+#             processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
 
-    # Process GitHub profile
-    if github_url and github_url.strip():
-        github_result = get_github_repositories(github_url)
+#     # Process GitHub profile
+#     if github_url and github_url.strip():
+#         github_result = get_github_repositories(github_url)
 
-        if github_result["status"] == "success":
-            processed_data["github"] = github_result
+#         if github_result["status"] == "success":
+#             processed_data["github"] = github_result
 
-        else:
-            processed_data["errors"].append(f"GitHub: {github_result['message']}")
+#         else:
+#             processed_data["errors"].append(f"GitHub: {github_result['message']}")
 
-    return processed_data
+#     return processed_data
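For reference, a minimal sketch of how the reworked call site behaves, assuming only what the diff above shows (extract_text takes a file path and returns a dict of parsed sections on success, or None on failure); the wrapper name and message strings are illustrative, not part of the commit:

    from functions.linkedin_resume import extract_text

    def report_extraction(linkedin_pdf_path: str) -> str:
        """Hypothetical helper mirroring the new process_inputs flow."""
        result = ""

        # extract_text returns a dict of sections, or None on any failure
        sections = extract_text(linkedin_pdf_path)

        if sections:
            result += f"Text extraction successful: {', '.join(sections)}\n"
        else:
            result += "Text extraction failed\n"

        return result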
functions/job_call.py
CHANGED
@@ -64,7 +64,7 @@ def summarize_job_call(job_call: str) -> str:
 
     if not job_call or not job_call.strip():
         logger.warning("No job call text provided for summarization")
-
+
         return None
 
     logger.info("Summarizing job call (%d characters)", len(job_call))

(The only change in this hunk is whitespace on the blank line before `return None`.)
functions/linkedin_resume.py
CHANGED
@@ -8,35 +8,18 @@ GitHub profiles, and job posting text.
 import re
 import logging
 import io
-import os
 import json
+import unicodedata
 from pathlib import Path
 from datetime import datetime
 import PyPDF2
 
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def check_default_linkedin_pdf():
-    """Check if default LinkedIn PDF exists in data directory."""
-
-    project_root = Path(__file__).parent.parent
-    default_pdf = f'{project_root}/data/linkedin_profile.pdf'
-
-    if not Path(default_pdf).exists():
-        logger.warning("Default LinkedIn PDF not found at %s", default_pdf)
-
-        return False, None
-
-    return True, default_pdf
+from functions.helper import clean_text_whitespace
+
+# pylint: disable=broad-exception-caught
 
 
-def extract_text_from_linkedin_pdf(pdf_file) -> dict:
+def extract_text(pdf_file: str) -> dict:
     """
     Extract and structure text content from an uploaded LinkedIn resume export PDF file
     for optimal LLM processing.
@@ -49,27 +32,22 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
 
     Example:
     {
-        "…": …,
-        "…": …,
-        …
-        },
-        "metadata": {...}
+        "contact_info": "...",
+        "summary": "...",
+        "skills": "...",
+        "experience": "...",
+        "education": "...",
+        "certifications": "...",
     }
     """
 
+    logger = logging.getLogger(f'{__name__}.extract_text')
+
     try:
-        # Get filename from path
-        filename = os.path.basename(pdf_file)
-
         # Read the PDF file from the file path
         with open(pdf_file, 'rb') as file:
             file_content = file.read()
-            file_size = len(file_content)
 
         # Create PDF reader from the file content
         pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
@@ -77,6 +55,7 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
         # Extract text from all pages
         extracted_text = ""
         num_pages = len(pdf_reader.pages)
+        logger.info("Extracting text from %d pages", num_pages)
 
         for page_num in range(num_pages):
             try:
@@ -89,38 +68,15 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
 
                 continue
 
+        logger.info("Extracted text length: %d characters", len(extracted_text))
+
         # Clean and structure the extracted text for LLM consumption
-        structured_content = _structure_resume_text(extracted_text)
-
-        if not structured_content…:
-            return {
-                …
-                "metadata": {
-                    "filename": filename,
-                    "file_size": file_size,
-                    "pages": num_pages
-                },
-                "message": "PDF processed but no text content was extracted"
-            }
-
-        logger.info(
-            "Successfully extracted and structured %d characters from %s",
-            len(structured_content['full_text']),
-            filename
-        )
-
-        result = {
-            "status": "success",
-            "structured_text": structured_content,
-            "metadata": {
-                "filename": filename,
-                "file_size": file_size,
-                "pages": num_pages,
-                "sections_found": list(structured_content["sections"].keys())
-            },
-            "message": f"Text extracted and structured successfully from {num_pages} pages"
-        }
+        structured_content = _parse_resume_text(extracted_text)
+
+        if not structured_content:
+            return None
+
+        logger.info("Found sections: %s", list(structured_content.keys()))
 
         # Save results to JSON file
         try:
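Aside: the PyPDF2 page loop that both sides of the hunk above share is easy to reproduce standalone. A sketch using only public PyPDF2 APIs (PdfReader, pages, extract_text); the per-page try/except mirrors the unchanged loop above:

    import io
    import PyPDF2

    def extract_pdf_text(path: str) -> str:
        # Read the whole file, then parse from an in-memory buffer
        with open(path, 'rb') as f:
            reader = PyPDF2.PdfReader(io.BytesIO(f.read()))

        text = ""
        for page in reader.pages:
            try:
                text += page.extract_text() + "\n"
            except Exception:
                # Skip pages whose text layer fails to extract
                continue

        return text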
@@ -132,27 +88,22 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
             output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"
 
             with open(output_file, 'w', encoding='utf-8') as f:
-                json.dump(…)
-
-            logger.info("LinkedIn resume extraction saved to %s", output_file)
+                json.dump(structured_content, f, indent=2, ensure_ascii=False)
 
         except Exception as save_error:
             logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
 
-        return result
+        return structured_content
 
     except Exception as e:
         logger.error("Error processing PDF file: %s", str(e))
 
-        return {
-            "status": "error",
-            "message": f"Failed to extract text from PDF: {str(e)}"
-        }
+        return None
 
 
-def _structure_resume_text(text: str) -> dict:
+def _parse_resume_text(text: str) -> dict:
     """
-    …
+    Parse resume text into logical sections for optimal LLM processing.
 
     Args:
         text (str): Raw extracted text from PDF
@@ -161,31 +112,20 @@ def _structure_resume_text(text: str) -> dict:
         dict: Structured text with sections, full text, and summary
     """
     if not text:
-        return {
-            "sections": {},
-            "full_text": "",
-            "llm_formatted": "",
-            "summary": "",
-            "format": "structured_resume",
-            "word_count": 0,
-            "section_count": 0
-        }
-
-    # Clean the text first
-    cleaned_text = _clean_extracted_text(text)
+        return None
 
     # Define section patterns (common LinkedIn export sections)
     section_patterns = {
         "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
         "summary": r"(?i)(summary|about|overview|profile)",
+        "skills": r"(?i)(skills|expertise|competencies|proficiencies)",
         "experience": r"(?i)(experience|work|employment|professional)",
         "education": r"(?i)(education|academic|university|college|school)",
-        "skills": r"(?i)(skills|competencies|technologies|technical)",
         "certifications": r"(?i)(certification|certificate|license)",
     }
 
     # Split text into lines for processing
-    lines = cleaned_text.split('\n')
+    lines = text.split('\n')
     sections = {}
     current_section = "general"
     current_content = []
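The loop body between this hunk and the next (unchanged by the commit, so the diff omits it) is where each line is matched against section_patterns. A speculative sketch of that kind of dispatch; the length cap and prefix matching are assumptions, not the commit's actual heuristics:

    import re

    SECTION_PATTERNS = {
        "summary": r"(?i)(summary|about|overview|profile)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
    }

    def split_sections(lines):
        sections = {}
        current_section = "general"
        current_content = []

        for line in lines:
            stripped = line.strip()

            # Treat a short line matching a pattern as a section header
            header = None
            if stripped and len(stripped) < 40:
                for name, pattern in SECTION_PATTERNS.items():
                    if re.match(pattern, stripped):
                        header = name
                        break

            if header:
                # Close the previous section and start a new bucket
                if current_content:
                    sections[current_section] = '\n'.join(current_content)
                current_section = header
                current_content = []
            else:
                current_content.append(line)

        # Flush the final section, as the hunk below does
        if current_content:
            sections[current_section] = '\n'.join(current_content)

        return sections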
@@ -222,145 +162,31 @@ def _structure_resume_text(text: str) -> dict:
     if current_content:
         sections[current_section] = '\n'.join(current_content)
 
-    # …
-    summary_parts = []
-
-    if "contact_info" in sections:
-        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
-
-    if "summary" in sections:
-        summary_parts.append(f"SUMMARY: {sections['summary']}")
-
-    if "experience" in sections:
-        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
-
-    if "education" in sections:
-        summary_parts.append(f"EDUCATION: {sections['education']}")
-
-    if "skills" in sections:
-        summary_parts.append(f"SKILLS: {sections['skills']}")
-
-    # Create LLM-optimized format
-    llm_formatted_text = _format_for_llm(sections)
-
-    return {
-        "sections": sections,
-        "full_text": cleaned_text,
-        "llm_formatted": llm_formatted_text,
-        "summary": '\n\n'.join(summary_parts),
-        "format": "structured_resume",
-        "word_count": len(cleaned_text.split()),
-        "section_count": len(sections)
-    }
-
-
-def _format_for_llm(sections: dict) -> str:
-    """
-    Format the resume sections in an optimal way for LLM processing.
-
-    Args:
-        sections (dict): Structured sections
-        full_text (str): Full cleaned text
-
-    Returns:
-        str: LLM-optimized formatted text
-    """
-    formatted_parts = ["=== RESUME CONTENT ===\n"]
-
-    # Prioritize sections in logical order for LLM
-    priority_order = ["summary", "contact_info", "experience", "education", "skills",
-                      "certifications", "projects", "achievements", "languages", "volunteer"]
-
-    # Add prioritized sections
-    for section_name in priority_order:
-        if section_name in sections:
-            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
-            formatted_parts.append(sections[section_name])
-            formatted_parts.append("")  # Empty line between sections
-
-    # Add any remaining sections
+    # Clean each section
     for section_name, content in sections.items():
-        …
-            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
-            formatted_parts.append(content)
-            formatted_parts.append("")
-
-    # Add general content if exists
-    if "general" in sections:
-        …
-        formatted_parts.append(sections["general"])
+        sections[section_name] = _clean_section(content)
 
-    formatted_parts.append("\n=== END RESUME ===")
+    return sections
 
-    return …
 
-
-def _clean_extracted_text(text: str) -> str:
+def _clean_section(text: str) -> str:
     """
-    Clean …
+    Clean a section of text by normalizing whitespace and removing unnecessary characters.
 
     Args:
-        text (str): …
+        text (str): The text section to clean
 
     Returns:
-        str: Cleaned text
+        str: Cleaned text section
     """
-    if not text:
-        return ""
-
-    # Remove excessive whitespace and normalize line endings
-    text = re.sub(r'\r\n', '\n', text)
-    text = re.sub(r'\r', '\n', text)
-
-    # Split into lines and clean each line
-    lines = text.split('\n')
-    cleaned_lines = []
-
-    for line in lines:
-        cleaned_line = line.strip()
-
-        if not cleaned_line:
-            continue
-
-        # Remove common PDF artifacts
-        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
-        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
-
-        if cleaned_line:
-            cleaned_lines.append(cleaned_line)
-
-    # Join lines and normalize spacing
-    cleaned_text = '\n'.join(cleaned_lines)
-
-    # Normalize multiple spaces to single spaces
-    cleaned_text = re.sub(r' +', ' ', cleaned_text)
-
-    # Normalize multiple newlines to maximum of 2
-    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-
-    return cleaned_text.strip()
-
-
-def get_llm_context_from_resume(extraction_result: dict) -> str:
-    """
-    Extract the best formatted text for LLM context from the extraction result.
-
-    Args:
-        extraction_result (dict): Result from extract_text_from_linkedin_pdf
-
-    Returns:
-        str: Formatted text ready for LLM context
-    """
-    if extraction_result.get("status") != "success":
-        return ""
-
-    structured_text = extraction_result.get("structured_text", {})
-
-    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
+
+    # Normalize unicode characters to avoid issues with special characters
+    text = unicodedata.normalize('NFKC', text)
 
+    # Remove `Page n of n` added by linkedin export
+    text = re.sub(r'Page \d+ of \d+', '', text)
 
+    # Clean redundant whitespace
+    text = clean_text_whitespace(text)
+
+    return text.strip()
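The new _clean_section is straightforward to exercise in isolation. A sketch with clean_text_whitespace stubbed in, since its real implementation lives in functions/helper.py and is not part of this diff:

    import re
    import unicodedata

    def clean_text_whitespace(text: str) -> str:
        # Stand-in for functions.helper.clean_text_whitespace (not shown
        # in this diff): collapse space runs and excess blank lines
        text = re.sub(r'[ \t]+', ' ', text)
        return re.sub(r'\n{3,}', '\n\n', text)

    def clean_section(text: str) -> str:
        # Same three steps as the new _clean_section above
        text = unicodedata.normalize('NFKC', text)
        text = re.sub(r'Page \d+ of \d+', '', text)
        return clean_text_whitespace(text).strip()

    sample = "Experience\u00a0\u00a0Engineer\nPage 1 of 3\n\n\n\nAcme Corp"
    print(clean_section(sample))
    # "Experience Engineer", a blank line, then "Acme Corp"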
|