"""
context_acquisition.py

Functions for acquiring context from various sources including PDF text extraction,
GitHub profiles, and job posting text.
"""

import re
import logging
import io
import json
import unicodedata
from pathlib import Path
from datetime import datetime
from typing import Optional

import PyPDF2

from functions.helper import clean_text_whitespace

# pylint: disable=broad-exception-caught


def extract_text(pdf_file: str) -> Optional[dict]:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        Optional[dict]: Dictionary of structured resume sections keyed by section
        name, or None if extraction or parsing fails

    Example:
        {
            "contact_info": "...",
            "summary": "...",
            "skills": "...",
            "experience": "...",
            "education": "...",
            "certifications": "...",
        }
    """

    logger = logging.getLogger(f'{__name__}.extract_text')

    try:

        # Read the PDF file from the file path
        with open(pdf_file, 'rb') as file:
            file_content = file.read()

        # Create PDF reader from the file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))

        # Extract text from all pages
        extracted_text = ""
        num_pages = len(pdf_reader.pages)
        logger.info("Extracting text from %d pages", num_pages)

        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
                extracted_text += page_text + "\n\n"

            except Exception as e:
                logger.warning("Error extracting text from page %d: %s", page_num, str(e))
                continue

        logger.info("Extracted text length: %d characters", len(extracted_text))

        # Clean and structure the extracted text for LLM consumption
        structured_content = _parse_resume_text(extracted_text)

        if not structured_content:
            return None

        logger.info("Found sections: %s", list(structured_content.keys()))

        # Save results to JSON file
        try:
            linkedin_profile_dir = Path(__file__).parent.parent / "data" / "linkedin_profile"
            linkedin_profile_dir.mkdir(parents=True, exist_ok=True)

            # Create timestamped filename
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = linkedin_profile_dir / f"linkedin_resume_{timestamp}.json"

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(structured_content, f, indent=2, ensure_ascii=False)

        except Exception as save_error:
            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))

        return structured_content

    except Exception as e:
        logger.error("Error processing PDF file: %s", str(e))

        return None


def _parse_resume_text(text: str) -> Optional[dict]:
    """
    Parse resume text into logical sections for optimal LLM processing.

    Args:
        text (str): Raw extracted text from PDF

    Returns:
        Optional[dict]: Dictionary mapping section names to cleaned section text,
        or None if the input text is empty
    """
    if not text:
        return None

    # Define section patterns (common LinkedIn export sections)
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
        "summary": r"(?i)(summary|about|overview|profile)",
        "skills": r"(?i)(skills|expertise|competencies|proficiencies)",
        "experience": r"(?i)(experience|work|employment|professional)",
        "education": r"(?i)(education|academic|university|college|school)",
        "certifications": r"(?i)(certification|certificate|license)",
    }
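
    # For example, in a typical LinkedIn export the header lines "Summary",
    # "Experience", and "Education" match the patterns above and start new
    # sections; any text before the first recognized header is collected
    # under the "general" key.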

    # Split text into lines for processing
    lines = text.split('\n')
    sections = {}
    current_section = "general"
    current_content = []

    for line in lines:
        line = line.strip()

        if not line:
            continue

        # Check whether the line looks like a section header. Note that
        # re.match anchors only at the start of the line, so body lines that
        # happen to begin with a keyword (e.g. "Professional ...") are also
        # treated as headers; this is a heuristic, not an exact parse.
        section_found = None

        for section_name, pattern in section_patterns.items():
            if re.match(pattern, line):
                section_found = section_name
                break

        if section_found:
            # Save the previous section's content before switching
            if current_content:
                sections[current_section] = '\n'.join(current_content)

            # Start new section
            current_section = section_found
            current_content = [line]

        else:
            current_content.append(line)

    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)

    # Clean each section
    for section_name, content in sections.items():
        sections[section_name] = _clean_section(content)

    return sections


def _clean_section(text: str) -> str:
    """
    Clean a section of text by normalizing whitespace and removing unnecessary characters.

    Args:
        text (str): The text section to clean

    Returns:
        str: Cleaned text section
    """

    # Normalize unicode characters to avoid issues with special characters
    text = unicodedata.normalize('NFKC', text)

    # Remove "Page n of n" footers added by the LinkedIn export
    text = re.sub(r'Page \d+ of \d+', '', text)

    # Clean redundant whitespace
    text = clean_text_whitespace(text)

    return text.strip()
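

# Usage sketch (assumption: run from the project root so `functions.helper`
# resolves; the PDF path below is hypothetical):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    parsed_sections = extract_text("data/linkedin_profile/Profile.pdf")

    if parsed_sections:
        for name, body in parsed_sections.items():
            # Print a short preview of each parsed section
            print(f"[{name}] {body[:80]}")
    else:
        print("Extraction failed; see log output for details.")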