gperdrizet committed
Commit f70c1ff · verified · Parent: e55b547

Cleaned up LinkedIn resume PDF text extraction and parsing

functions/gradio.py CHANGED
@@ -7,8 +7,8 @@ Functions for handling Gradio UI interactions and processing user inputs.
 import logging
 from pathlib import Path
 from functions.helper import clean_text_whitespace
-from functions.linkedin_resume import extract_text_from_linkedin_pdf
-from functions.github import get_github_repositories
+from functions.linkedin_resume import extract_text
+# from functions.github import get_github_repositories
 # from functions.job_call import summarize_job_call
 # from functions.writer_agent import write_resume

@@ -60,33 +60,26 @@ def process_inputs(
     logger.info("User instructions: %s", user_instructions[:100] if user_instructions else "None")
     result = ""

-    # extraction_result = None
-    # logger.info("Processing user inputs from Gradio interface")
-
-    # # Process LinkedIn PDF file
-    # if linkedin_pdf is not None:
-    #     file_path = linkedin_pdf.name
-    #     file_display_name = Path(file_path).name
-
-    #     result += "✅ LinkedIn Resume PDF provided\n"
-    #     logger.info("Processing LinkedIn PDF: %s", file_display_name)
-
-    #     # Extract and structure text from the PDF
-    #     extraction_result = extract_text_from_linkedin_pdf(file_path)
-
-    #     if extraction_result["status"] == "success":
-    #         result += "   Text extraction successful\n\n"
-    #         logger.info("LinkedIn PDF text extraction successful")
-
-    #     elif extraction_result["status"] == "warning":
-    #         result += f"   ⚠️ Text extraction: {extraction_result['message']}\n\n"
-    #         logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
-    #     else:
-    #         result += f"   ❌ Text extraction failed: {extraction_result['message']}\n\n"
-    #         logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
+    # Extract and structure text from the linkedin profile PDF
+    logger.info("Extracting text from LinkedIn PDF: %s", linkedin_pdf_path)
+    extraction_result = extract_text(linkedin_pdf_path)
+
+    if extraction_result:
+        logger.info("LinkedIn PDF text extraction successful")
+
+    else:
+        logger.error("LinkedIn PDF text extraction failed")
+
+    # if extraction_result["status"] == "success":
+    #     result += "   ✅ Text extraction successful\n\n"
+    #     logger.info("LinkedIn PDF text extraction successful")
+
+    # elif extraction_result["status"] == "warning":
+    #     result += f"   ⚠️ Text extraction: {extraction_result['message']}\n\n"
+    #     logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
     # else:
-    #     result += "❌ No LinkedIn resume PDF file uploaded\n\n"
-    #     logger.info("No LinkedIn PDF file provided")
+    #     result += f"   Text extraction failed: {extraction_result['message']}\n\n"
+    #     logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])

     # # Process GitHub profile
     # if github_url and github_url.strip():
@@ -153,50 +146,50 @@ def process_inputs(
     return result


-def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
-    """
-    Get structured data from all inputs for further processing.
+# def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
+#     """
+#     Get structured data from all inputs for further processing.

-    Args:
-        linkedin_pdf: Uploaded LinkedIn resume export PDF file
-        github_url (str): GitHub profile URL
-        job_post_text (str): Job post text content
-        instructions (str): Additional instructions from the user
+#     Args:
+#         linkedin_pdf: Uploaded LinkedIn resume export PDF file
+#         github_url (str): GitHub profile URL
+#         job_post_text (str): Job post text content
+#         instructions (str): Additional instructions from the user

-    Returns:
-        dict: Structured data containing all processed information
-    """
+#     Returns:
+#         dict: Structured data containing all processed information
+#     """

-    job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
-    instructions = instructions.strip() if instructions and instructions.strip() else None
+#     job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
+#     instructions = instructions.strip() if instructions and instructions.strip() else None

-    processed_data = {
-        "linkedin": None,
-        "github": None,
-        "job_post": job_post_text,
-        "user_instructions": instructions,
-        "errors": []
-    }
+#     processed_data = {
+#         "linkedin": None,
+#         "github": None,
+#         "job_post": job_post_text,
+#         "user_instructions": instructions,
+#         "errors": []
+#     }

-    # Process LinkedIn PDF
-    if linkedin_pdf is not None:
-        file_path = linkedin_pdf.name
-        extraction_result = extract_text_from_linkedin_pdf(file_path)
+#     # Process LinkedIn PDF
+#     if linkedin_pdf is not None:
+#         file_path = linkedin_pdf.name
+#         extraction_result = extract_text_from_linkedin_pdf(file_path)

-        if extraction_result["status"] == "success":
-            processed_data["linkedin"] = extraction_result
+#         if extraction_result["status"] == "success":
+#             processed_data["linkedin"] = extraction_result

-        else:
-            processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
+#         else:
+#             processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")

-    # Process GitHub profile
-    if github_url and github_url.strip():
-        github_result = get_github_repositories(github_url)
+#     # Process GitHub profile
+#     if github_url and github_url.strip():
+#         github_result = get_github_repositories(github_url)

-        if github_result["status"] == "success":
-            processed_data["github"] = github_result
+#         if github_result["status"] == "success":
+#             processed_data["github"] = github_result

-        else:
-            processed_data["errors"].append(f"GitHub: {github_result['message']}")
+#         else:
+#             processed_data["errors"].append(f"GitHub: {github_result['message']}")

-    return processed_data
+#     return processed_data
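
With this change, `process_inputs` receives the upload as a plain filesystem path and branches on the truthiness of `extract_text()` (a dict of sections on success, `None` on failure) instead of unwrapping a status dict. A trimmed-down, single-input sketch of that wiring, assuming a Gradio `gr.File` input configured with `type="filepath"` so the handler receives a path string; the interface layout here is illustrative, not the app's actual UI:

import gradio as gr

from functions.linkedin_resume import extract_text


def process_inputs(linkedin_pdf_path: str) -> str:
    """Handle a Gradio submission: extract structured text from the uploaded PDF."""

    # extract_text() returns a dict of sections on success, None on failure
    extraction_result = extract_text(linkedin_pdf_path)

    if extraction_result:
        return "✅ Parsed sections: " + ", ".join(extraction_result)

    return "❌ LinkedIn PDF text extraction failed"


demo = gr.Interface(
    fn=process_inputs,
    inputs=gr.File(label="LinkedIn resume PDF", type="filepath"),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()
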
functions/job_call.py CHANGED
@@ -64,7 +64,7 @@ def summarize_job_call(job_call: str) -> str:

     if not job_call or not job_call.strip():
         logger.warning("No job call text provided for summarization")
-
+
         return None

     logger.info("Summarizing job call (%d characters)", len(job_call))
functions/linkedin_resume.py CHANGED
@@ -8,35 +8,18 @@ GitHub profiles, and job posting text.
 import re
 import logging
 import io
-import os
 import json
+import unicodedata
 from pathlib import Path
 from datetime import datetime
 import PyPDF2

-# pylint: disable=broad-exception-caught
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def check_default_linkedin_pdf():
-    """Check if default LinkedIn PDF exists in data directory."""
-
-    # Get the project root directory (parent of functions directory)
-    project_root = Path(__file__).parent.parent
-    default_pdf = f'{project_root}/data/linkedin_profile.pdf'
-
-    if not Path(default_pdf).exists():
-        logger.warning("Default LinkedIn PDF not found at %s", default_pdf)
-
-        return False, None
-
-    return True, default_pdf
+from functions.helper import clean_text_whitespace
+
+# pylint: disable=broad-exception-caught


-def extract_text_from_linkedin_pdf(pdf_file) -> dict:
+def extract_text(pdf_file: str) -> dict:
     """
     Extract and structure text content from an uploaded LinkedIn resume export PDF file
     for optimal LLM processing.
@@ -49,27 +32,22 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
     Example:
         {
-            "status": "success",
-            "structured_text": {
-                "sections": {...},
-                "full_text": "...",
-                "llm_formatted": "...",
-                "summary": "..."
-            },
-            "metadata": {...}
+            "contact_info": "...",
+            "summary": "...",
+            "skills": "...",
+            "experience": "...",
+            "education": "...",
+            "certifications": "...",
         }
     """
-    if pdf_file is None:
-        return {"status": "error", "message": "No PDF file provided"}
+
+    logger = logging.getLogger(f'{__name__}.extract_text')

     try:
-        # Get filename from path
-        filename = os.path.basename(pdf_file)

         # Read the PDF file from the file path
         with open(pdf_file, 'rb') as file:
             file_content = file.read()
-            file_size = len(file_content)

         # Create PDF reader from the file content
         pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
@@ -77,6 +55,7 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
         # Extract text from all pages
        extracted_text = ""
         num_pages = len(pdf_reader.pages)
+        logger.info("Extracting text from %d pages", num_pages)

         for page_num in range(num_pages):
             try:
@@ -89,38 +68,15 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
                 continue

+        logger.info("Extracted text length: %d characters", len(extracted_text))
+
         # Clean and structure the extracted text for LLM consumption
-        structured_content = _structure_resume_text(extracted_text)
-
-        if not structured_content["full_text"].strip():
-            return {
-                "status": "warning",
-                "structured_text": structured_content,
-                "metadata": {
-                    "filename": filename,
-                    "file_size": file_size,
-                    "pages": num_pages
-                },
-                "message": "PDF processed but no text content was extracted"
-            }
-
-        logger.info(
-            "Successfully extracted and structured %d characters from %s",
-            len(structured_content['full_text']),
-            filename
-        )
-
-        result = {
-            "status": "success",
-            "structured_text": structured_content,
-            "metadata": {
-                "filename": filename,
-                "file_size": file_size,
-                "pages": num_pages,
-                "sections_found": list(structured_content["sections"].keys())
-            },
-            "message": f"Text extracted and structured successfully from {num_pages} pages"
-        }
+        structured_content = _parse_resume_text(extracted_text)
+
+        if not structured_content:
+            return None
+
+        logger.info("Found sections: %s", list(structured_content.keys()))

         # Save results to JSON file
         try:
@@ -132,27 +88,22 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
             output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"

             with open(output_file, 'w', encoding='utf-8') as f:
-                json.dump(result, f, indent=2, ensure_ascii=False)
-
-            logger.info("LinkedIn resume extraction saved to %s", output_file)
+                json.dump(structured_content, f, indent=2, ensure_ascii=False)

         except Exception as save_error:
             logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))

-        return result
+        return structured_content

     except Exception as e:
         logger.error("Error processing PDF file: %s", str(e))

-        return {
-            "status": "error",
-            "message": f"Failed to extract text from PDF: {str(e)}"
-        }
+        return None


-def _structure_resume_text(text: str) -> dict:
+def _parse_resume_text(text: str) -> dict:
     """
-    Structure resume text into logical sections for optimal LLM processing.
+    Parse resume text into logical sections for optimal LLM processing.

     Args:
         text (str): Raw extracted text from PDF
@@ -161,31 +112,20 @@ def _structure_resume_text(text: str) -> dict:
         dict: Structured text with sections, full text, and summary
     """
     if not text:
-        return {
-            "sections": {},
-            "full_text": "",
-            "llm_formatted": "",
-            "summary": "",
-            "format": "structured_resume",
-            "word_count": 0,
-            "section_count": 0
-        }
-
-    # Clean the text first
-    cleaned_text = _clean_extracted_text(text)
+        return None

     # Define section patterns (common LinkedIn export sections)
     section_patterns = {
         "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
         "summary": r"(?i)(summary|about|overview|profile)",
+        "skills": r"(?i)(skills|expertise|competencies|proficiencies)",
         "experience": r"(?i)(experience|work|employment|professional)",
         "education": r"(?i)(education|academic|university|college|school)",
-        "skills": r"(?i)(skills|competencies|technologies|technical)",
         "certifications": r"(?i)(certification|certificate|license)",
     }

     # Split text into lines for processing
-    lines = cleaned_text.split('\n')
+    lines = text.split('\n')
     sections = {}
     current_section = "general"
     current_content = []
@@ -222,145 +162,31 @@ def _structure_resume_text(text: str) -> dict:
     if current_content:
         sections[current_section] = '\n'.join(current_content)

-    # Create a structured summary for LLM context
-    summary_parts = []
-
-    if "contact_info" in sections:
-        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
-
-    if "summary" in sections:
-        summary_parts.append(f"SUMMARY: {sections['summary']}")
-
-    if "experience" in sections:
-        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
-
-    if "education" in sections:
-        summary_parts.append(f"EDUCATION: {sections['education']}")
-
-    if "skills" in sections:
-        summary_parts.append(f"SKILLS: {sections['skills']}")
-
-    # Create LLM-optimized format
-    llm_formatted_text = _format_for_llm(sections)
-
-    return {
-        "sections": sections,
-        "full_text": cleaned_text,
-        "llm_formatted": llm_formatted_text,
-        "summary": '\n\n'.join(summary_parts),
-        "format": "structured_resume",
-        "word_count": len(cleaned_text.split()),
-        "section_count": len(sections)
-    }
-
-
-def _format_for_llm(sections: dict) -> str:
-    """
-    Format the resume sections in an optimal way for LLM processing.
-
-    Args:
-        sections (dict): Structured sections
-        full_text (str): Full cleaned text
-
-    Returns:
-        str: LLM-optimized formatted text
-    """
-    formatted_parts = ["=== RESUME CONTENT ===\n"]
-
-    # Prioritize sections in logical order for LLM
-    priority_order = ["summary", "contact_info", "experience", "education", "skills",
-                      "certifications", "projects", "achievements", "languages", "volunteer"]
-
-    # Add prioritized sections
-    for section_name in priority_order:
-        if section_name in sections:
-
-            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
-            formatted_parts.append(sections[section_name])
-            formatted_parts.append("")  # Empty line between sections
-
-    # Add any remaining sections
+    # Clean each section
     for section_name, content in sections.items():
-        if section_name not in priority_order and section_name != "general":
-
-            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
-            formatted_parts.append(content)
-            formatted_parts.append("")
-
-    # Add general content if exists
-    if "general" in sections:
-
-        formatted_parts.append("[ADDITIONAL INFORMATION]")
-        formatted_parts.append(sections["general"])
-
-    formatted_parts.append("\n=== END RESUME ===")
-
-    return '\n'.join(formatted_parts)
-
-
-def _clean_extracted_text(text: str) -> str:
+        sections[section_name] = _clean_section(content)
+
+    return sections
+
+
+def _clean_section(text: str) -> str:
     """
-    Clean and normalize extracted text from PDF for better LLM processing.
+    Clean a section of text by normalizing whitespace and removing unnecessary characters.

     Args:
-        text (str): Raw extracted text
+        text (str): The text section to clean

     Returns:
-        str: Cleaned text optimized for LLM consumption
+        str: Cleaned text section
     """
-    if not text:
-        return ""
-
-    # Remove excessive whitespace and normalize line endings
-    text = re.sub(r'\r\n', '\n', text)
-    text = re.sub(r'\r', '\n', text)
-
-    # Split into lines and clean each line
-    lines = text.split('\n')
-    cleaned_lines = []
-
-    for line in lines:
-
-        # Strip whitespace
-        cleaned_line = line.strip()
-
-        # Skip empty lines and very short lines (likely artifacts)
-        if len(cleaned_line) < 2:
-            continue
-
-        # Remove common PDF artifacts
-        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
-        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
-
-        if cleaned_line:
-            cleaned_lines.append(cleaned_line)
-
-    # Join lines and normalize spacing
-    cleaned_text = '\n'.join(cleaned_lines)
-
-    # Normalize multiple spaces to single spaces
-    cleaned_text = re.sub(r' +', ' ', cleaned_text)
-
-    # Normalize multiple newlines to maximum of 2
-    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-
-    return cleaned_text.strip()
-
-
-def get_llm_context_from_resume(extraction_result: dict) -> str:
-    """
-    Extract the best formatted text for LLM context from the extraction result.
-
-    Args:
-        extraction_result (dict): Result from extract_text_from_linkedin_pdf
-
-    Returns:
-        str: Formatted text ready for LLM context
-    """
-    if extraction_result.get("status") != "success":
-        return ""
-
-    structured_text = extraction_result.get("structured_text", {})
-
-    # Return the LLM-formatted version if available, otherwise fall back to full text
-    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
+
+    # Normalize unicode characters to avoid issues with special characters
+    text = unicodedata.normalize('NFKC', text)
+
+    # Remove `Page n of n` added by linkedin export
+    text = re.sub(r'Page \d+ of \d+', '', text)
+
+    # Clean redundant whitespace
+    text = clean_text_whitespace(text)
+
+    return text.strip()
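
After this commit, `extract_text` returns the cleaned section dict directly (or `None` on failure), so callers no longer unwrap a `status`/`structured_text` envelope. A minimal consumption sketch; the PDF path is illustrative (it mirrors the default location the removed `check_default_linkedin_pdf` helper used):

from functions.linkedin_resume import extract_text

# Illustrative path; any LinkedIn profile export PDF works
sections = extract_text('data/linkedin_profile.pdf')

if sections is None:
    print('Extraction failed or the PDF contained no text')
else:
    # Keys come from the section patterns in _parse_resume_text,
    # e.g. contact_info, summary, skills, experience, education
    for name, content in sections.items():
        print(f'[{name.upper()}]')
        print(content[:200])
        print()

Note that `_clean_section` also depends on `functions.helper.clean_text_whitespace`, which is outside this diff; judging by its name and the surrounding comments, it presumably collapses redundant spaces and blank lines.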