gperdrizet committed
Commit f70c1ff · verified · Parent: e55b547

Cleaned up LinkedIn resume PDF text extraction and parsing

functions/gradio.py CHANGED
@@ -7,8 +7,8 @@ Functions for handling Gradio UI interactions and processing user inputs.
 import logging
 from pathlib import Path
 from functions.helper import clean_text_whitespace
-from functions.linkedin_resume import extract_text_from_linkedin_pdf
-from functions.github import get_github_repositories
+from functions.linkedin_resume import extract_text
+# from functions.github import get_github_repositories
 # from functions.job_call import summarize_job_call
 # from functions.writer_agent import write_resume

@@ -60,33 +60,26 @@ def process_inputs(
     logger.info("User instructions: %s", user_instructions[:100] if user_instructions else "None")
     result = ""

-    # extraction_result = None
-    # logger.info("Processing user inputs from Gradio interface")
-
-    # # Process LinkedIn PDF file
-    # if linkedin_pdf is not None:
-    #     file_path = linkedin_pdf.name
-    #     file_display_name = Path(file_path).name
-
-    #     result += "✅ LinkedIn Resume PDF provided\n"
-    #     logger.info("Processing LinkedIn PDF: %s", file_display_name)
-
-    #     # Extract and structure text from the PDF
-    #     extraction_result = extract_text_from_linkedin_pdf(file_path)
-
-    #     if extraction_result["status"] == "success":
-    #         result += "   Text extraction successful\n\n"
-    #         logger.info("LinkedIn PDF text extraction successful")
-
-    #     elif extraction_result["status"] == "warning":
-    #         result += f"   ⚠️ Text extraction: {extraction_result['message']}\n\n"
-    #         logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
-    #     else:
-    #         result += f"   ❌ Text extraction failed: {extraction_result['message']}\n\n"
-    #         logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
+    # Extract and structure text from the linkedin profile PDF
+    logger.info("Extracting text from LinkedIn PDF: %s", linkedin_pdf_path)
+    extraction_result = extract_text(linkedin_pdf_path)
+
+    if extraction_result:
+        logger.info("LinkedIn PDF text extraction successful")
+
+    else:
+        logger.error("LinkedIn PDF text extraction failed")
+
+    # if extraction_result["status"] == "success":
+    #     result += "   ✅ Text extraction successful\n\n"
+    #     logger.info("LinkedIn PDF text extraction successful")
+
+    # elif extraction_result["status"] == "warning":
+    #     result += f"   ⚠️ Text extraction: {extraction_result['message']}\n\n"
+    #     logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
     # else:
-    #     result += "❌ No LinkedIn resume PDF file uploaded\n\n"
-    #     logger.info("No LinkedIn PDF file provided")
+    #     result += f"   Text extraction failed: {extraction_result['message']}\n\n"
+    #     logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])

     # # Process GitHub profile
     # if github_url and github_url.strip():
@@ -153,50 +146,50 @@ def process_inputs(
     return result


-def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
-    """
-    Get structured data from all inputs for further processing.
+# def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
+#     """
+#     Get structured data from all inputs for further processing.

-    Args:
-        linkedin_pdf: Uploaded LinkedIn resume export PDF file
-        github_url (str): GitHub profile URL
-        job_post_text (str): Job post text content
-        instructions (str): Additional instructions from the user
+#     Args:
+#         linkedin_pdf: Uploaded LinkedIn resume export PDF file
+#         github_url (str): GitHub profile URL
+#         job_post_text (str): Job post text content
+#         instructions (str): Additional instructions from the user

-    Returns:
-        dict: Structured data containing all processed information
-    """
+#     Returns:
+#         dict: Structured data containing all processed information
+#     """

-    job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
-    instructions = instructions.strip() if instructions and instructions.strip() else None
+#     job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
+#     instructions = instructions.strip() if instructions and instructions.strip() else None

-    processed_data = {
-        "linkedin": None,
-        "github": None,
-        "job_post": job_post_text,
-        "user_instructions": instructions,
-        "errors": []
-    }
+#     processed_data = {
+#         "linkedin": None,
+#         "github": None,
+#         "job_post": job_post_text,
+#         "user_instructions": instructions,
+#         "errors": []
+#     }

-    # Process LinkedIn PDF
-    if linkedin_pdf is not None:
-        file_path = linkedin_pdf.name
-        extraction_result = extract_text_from_linkedin_pdf(file_path)
+#     # Process LinkedIn PDF
+#     if linkedin_pdf is not None:
+#         file_path = linkedin_pdf.name
+#         extraction_result = extract_text_from_linkedin_pdf(file_path)

-        if extraction_result["status"] == "success":
-            processed_data["linkedin"] = extraction_result
+#         if extraction_result["status"] == "success":
+#             processed_data["linkedin"] = extraction_result

-        else:
-            processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
+#         else:
+#             processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")

-    # Process GitHub profile
-    if github_url and github_url.strip():
-        github_result = get_github_repositories(github_url)
+#     # Process GitHub profile
+#     if github_url and github_url.strip():
+#         github_result = get_github_repositories(github_url)

-        if github_result["status"] == "success":
-            processed_data["github"] = github_result
+#         if github_result["status"] == "success":
+#             processed_data["github"] = github_result

-        else:
-            processed_data["errors"].append(f"GitHub: {github_result['message']}")
+#         else:
+#             processed_data["errors"].append(f"GitHub: {github_result['message']}")

-    return processed_data
+#     return processed_data
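
With this change, `process_inputs` receives the upload as a plain filesystem path and branches on the truthiness of `extract_text()` (a dict of sections on success, `None` on failure) instead of unwrapping a status dict. A trimmed-down, single-input sketch of that wiring, assuming a Gradio `gr.File` input configured with `type="filepath"` so the handler receives a path string; the interface layout here is illustrative, not the app's actual UI:

import gradio as gr

from functions.linkedin_resume import extract_text


def process_inputs(linkedin_pdf_path: str) -> str:
    """Handle a Gradio submission: extract structured text from the uploaded PDF."""

    # extract_text() returns a dict of sections on success, None on failure
    extraction_result = extract_text(linkedin_pdf_path)

    if extraction_result:
        return "✅ Parsed sections: " + ", ".join(extraction_result)

    return "❌ LinkedIn PDF text extraction failed"


demo = gr.Interface(
    fn=process_inputs,
    inputs=gr.File(label="LinkedIn resume PDF", type="filepath"),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()
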
functions/job_call.py CHANGED
@@ -64,7 +64,7 @@ def summarize_job_call(job_call: str) -> str:

     if not job_call or not job_call.strip():
         logger.warning("No job call text provided for summarization")
-
+
         return None

     logger.info("Summarizing job call (%d characters)", len(job_call))
functions/linkedin_resume.py CHANGED
@@ -8,35 +8,18 @@ GitHub profiles, and job posting text.
 import re
 import logging
 import io
-import os
 import json
+import unicodedata
 from pathlib import Path
 from datetime import datetime
 import PyPDF2

-# pylint: disable=broad-exception-caught
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def check_default_linkedin_pdf():
-    """Check if default LinkedIn PDF exists in data directory."""
-
-    # Get the project root directory (parent of functions directory)
-    project_root = Path(__file__).parent.parent
-    default_pdf = f'{project_root}/data/linkedin_profile.pdf'
-
-    if not Path(default_pdf).exists():
-        logger.warning("Default LinkedIn PDF not found at %s", default_pdf)
-
-        return False, None
-
-    return True, default_pdf
+from functions.helper import clean_text_whitespace
+
+# pylint: disable=broad-exception-caught


-def extract_text_from_linkedin_pdf(pdf_file) -> dict:
+def extract_text(pdf_file: str) -> dict:
     """
     Extract and structure text content from an uploaded LinkedIn resume export PDF file
     for optimal LLM processing.
@@ -49,27 +32,22 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
     Example:
         {
-            "status": "success",
-            "structured_text": {
-                "sections": {...},
-                "full_text": "...",
-                "llm_formatted": "...",
-                "summary": "..."
-            },
-            "metadata": {...}
+            "contact_info": "...",
+            "summary": "...",
+            "skills": "...",
+            "experience": "...",
+            "education": "...",
+            "certifications": "...",
         }
     """
-    if pdf_file is None:
-        return {"status": "error", "message": "No PDF file provided"}
+
+    logger = logging.getLogger(f'{__name__}.extract_text')

     try:
-        # Get filename from path
-        filename = os.path.basename(pdf_file)

         # Read the PDF file from the file path
         with open(pdf_file, 'rb') as file:
             file_content = file.read()
-            file_size = len(file_content)

         # Create PDF reader from the file content
         pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
@@ -77,6 +55,7 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
         # Extract text from all pages
        extracted_text = ""
         num_pages = len(pdf_reader.pages)
+        logger.info("Extracting text from %d pages", num_pages)

         for page_num in range(num_pages):
             try:
@@ -89,38 +68,15 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
                 continue

+        logger.info("Extracted text length: %d characters", len(extracted_text))
+
         # Clean and structure the extracted text for LLM consumption
-        structured_content = _structure_resume_text(extracted_text)
-
-        if not structured_content["full_text"].strip():
-            return {
-                "status": "warning",
-                "structured_text": structured_content,
-                "metadata": {
-                    "filename": filename,
-                    "file_size": file_size,
-                    "pages": num_pages
-                },
-                "message": "PDF processed but no text content was extracted"
-            }
-
-        logger.info(
-            "Successfully extracted and structured %d characters from %s",
-            len(structured_content['full_text']),
-            filename
-        )
-
-        result = {
-            "status": "success",
-            "structured_text": structured_content,
-            "metadata": {
-                "filename": filename,
-                "file_size": file_size,
-                "pages": num_pages,
-                "sections_found": list(structured_content["sections"].keys())
-            },
-            "message": f"Text extracted and structured successfully from {num_pages} pages"
-        }
+        structured_content = _parse_resume_text(extracted_text)
+
+        if not structured_content:
+            return None
+
+        logger.info("Found sections: %s", list(structured_content.keys()))

         # Save results to JSON file
         try:
@@ -132,27 +88,22 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
             output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"

             with open(output_file, 'w', encoding='utf-8') as f:
-                json.dump(result, f, indent=2, ensure_ascii=False)
-
-            logger.info("LinkedIn resume extraction saved to %s", output_file)
+                json.dump(structured_content, f, indent=2, ensure_ascii=False)

         except Exception as save_error:
             logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))

-        return result
+        return structured_content

     except Exception as e:
         logger.error("Error processing PDF file: %s", str(e))

-        return {
-            "status": "error",
-            "message": f"Failed to extract text from PDF: {str(e)}"
-        }
+        return None


-def _structure_resume_text(text: str) -> dict:
+def _parse_resume_text(text: str) -> dict:
     """
-    Structure resume text into logical sections for optimal LLM processing.
+    Parse resume text into logical sections for optimal LLM processing.

     Args:
         text (str): Raw extracted text from PDF
@@ -161,31 +112,20 @@ def _structure_resume_text(text: str) -> dict:
         dict: Structured text with sections, full text, and summary
     """
     if not text:
-        return {
-            "sections": {},
-            "full_text": "",
-            "llm_formatted": "",
-            "summary": "",
-            "format": "structured_resume",
-            "word_count": 0,
-            "section_count": 0
-        }
-
-    # Clean the text first
-    cleaned_text = _clean_extracted_text(text)
+        return None

     # Define section patterns (common LinkedIn export sections)
     section_patterns = {
         "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
         "summary": r"(?i)(summary|about|overview|profile)",
+        "skills": r"(?i)(skills|expertise|competencies|proficiencies)",
         "experience": r"(?i)(experience|work|employment|professional)",
         "education": r"(?i)(education|academic|university|college|school)",
-        "skills": r"(?i)(skills|competencies|technologies|technical)",
         "certifications": r"(?i)(certification|certificate|license)",
     }

     # Split text into lines for processing
-    lines = cleaned_text.split('\n')
+    lines = text.split('\n')
     sections = {}
     current_section = "general"
     current_content = []
@@ -222,145 +162,31 @@ def _structure_resume_text(text: str) -> dict:
     if current_content:
         sections[current_section] = '\n'.join(current_content)

-    # Create a structured summary for LLM context
-    summary_parts = []
-
-    if "contact_info" in sections:
-        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
-
-    if "summary" in sections:
-        summary_parts.append(f"SUMMARY: {sections['summary']}")
-
-    if "experience" in sections:
-        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
-
-    if "education" in sections:
-        summary_parts.append(f"EDUCATION: {sections['education']}")
-
-    if "skills" in sections:
-        summary_parts.append(f"SKILLS: {sections['skills']}")
-
-    # Create LLM-optimized format
-    llm_formatted_text = _format_for_llm(sections)
-
-    return {
-        "sections": sections,
-        "full_text": cleaned_text,
-        "llm_formatted": llm_formatted_text,
-        "summary": '\n\n'.join(summary_parts),
-        "format": "structured_resume",
-        "word_count": len(cleaned_text.split()),
-        "section_count": len(sections)
-    }
-
-
-def _format_for_llm(sections: dict) -> str:
-    """
-    Format the resume sections in an optimal way for LLM processing.
-
-    Args:
-        sections (dict): Structured sections
-        full_text (str): Full cleaned text
-
-    Returns:
-        str: LLM-optimized formatted text
-    """
-    formatted_parts = ["=== RESUME CONTENT ===\n"]
-
-    # Prioritize sections in logical order for LLM
-    priority_order = ["summary", "contact_info", "experience", "education", "skills",
-                      "certifications", "projects", "achievements", "languages", "volunteer"]
-
-    # Add prioritized sections
-    for section_name in priority_order:
-        if section_name in sections:
-
-            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
-            formatted_parts.append(sections[section_name])
-            formatted_parts.append("")  # Empty line between sections
-
-    # Add any remaining sections
+    # Clean each section
     for section_name, content in sections.items():
-        if section_name not in priority_order and section_name != "general":
-
-            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
-            formatted_parts.append(content)
-            formatted_parts.append("")
-
-    # Add general content if exists
-    if "general" in sections:
-
-        formatted_parts.append("[ADDITIONAL INFORMATION]")
-        formatted_parts.append(sections["general"])
-
-    formatted_parts.append("\n=== END RESUME ===")
-
-    return '\n'.join(formatted_parts)
-
-
-def _clean_extracted_text(text: str) -> str:
+        sections[section_name] = _clean_section(content)
+
+    return sections
+
+
+def _clean_section(text: str) -> str:
     """
-    Clean and normalize extracted text from PDF for better LLM processing.
+    Clean a section of text by normalizing whitespace and removing unnecessary characters.

     Args:
-        text (str): Raw extracted text
+        text (str): The text section to clean

     Returns:
-        str: Cleaned text optimized for LLM consumption
+        str: Cleaned text section
     """
-    if not text:
-        return ""
-
-    # Remove excessive whitespace and normalize line endings
-    text = re.sub(r'\r\n', '\n', text)
-    text = re.sub(r'\r', '\n', text)
-
-    # Split into lines and clean each line
-    lines = text.split('\n')
-    cleaned_lines = []
-
-    for line in lines:
-
-        # Strip whitespace
-        cleaned_line = line.strip()
-
-        # Skip empty lines and very short lines (likely artifacts)
-        if len(cleaned_line) < 2:
-            continue
-
-        # Remove common PDF artifacts
-        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
-        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
-
-        if cleaned_line:
-            cleaned_lines.append(cleaned_line)
-
-    # Join lines and normalize spacing
-    cleaned_text = '\n'.join(cleaned_lines)
-
-    # Normalize multiple spaces to single spaces
-    cleaned_text = re.sub(r' +', ' ', cleaned_text)
-
-    # Normalize multiple newlines to maximum of 2
-    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-
-    return cleaned_text.strip()
-
-
-def get_llm_context_from_resume(extraction_result: dict) -> str:
-    """
-    Extract the best formatted text for LLM context from the extraction result.
-
-    Args:
-        extraction_result (dict): Result from extract_text_from_linkedin_pdf
-
-    Returns:
-        str: Formatted text ready for LLM context
-    """
-    if extraction_result.get("status") != "success":
-        return ""
-
-    structured_text = extraction_result.get("structured_text", {})
-
-    # Return the LLM-formatted version if available, otherwise fall back to full text
-    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
+
+    # Normalize unicode characters to avoid issues with special characters
+    text = unicodedata.normalize('NFKC', text)
+
+    # Remove `Page n of n` added by linkedin export
+    text = re.sub(r'Page \d+ of \d+', '', text)
+
+    # Clean redundant whitespace
+    text = clean_text_whitespace(text)
+
+    return text.strip()
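
After this commit, `extract_text` returns the cleaned section dict directly (or `None` on failure), so callers no longer unwrap a `status`/`structured_text` envelope. A minimal consumption sketch; the PDF path is illustrative (it mirrors the default location the removed `check_default_linkedin_pdf` helper used):

from functions.linkedin_resume import extract_text

# Illustrative path; any LinkedIn profile export PDF works
sections = extract_text('data/linkedin_profile.pdf')

if sections is None:
    print('Extraction failed or the PDF contained no text')
else:
    # Keys come from the section patterns in _parse_resume_text,
    # e.g. contact_info, summary, skills, experience, education
    for name, content in sections.items():
        print(f'[{name.upper()}]')
        print(content[:200])
        print()

Note that `_clean_section` also depends on `functions.helper.clean_text_whitespace`, which is outside this diff; judging by its name and the surrounding comments, it presumably collapses redundant spaces and blank lines.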