Fixed merge conflict
- .devcontainer/devcontainer.json +1 -1
- .gitignore +1 -2
- functions/__init__.py +0 -10
- functions/context_acquisition.py +266 -166
- packages.txt +0 -1
- requirements.txt +1 -2
- resumate.py +50 -22
- tests/test_context_acquisition.py +0 -249
.devcontainer/devcontainer.json
CHANGED
@@ -3,7 +3,7 @@
 {
   "name": "Python 3.10: resumate",
   "image": "mcr.microsoft.com/devcontainers/python:0-3.11",
-  "onCreateCommand": "sudo apt update && sudo apt upgrade -y && …",
+  "onCreateCommand": "sudo apt update && sudo apt upgrade -y && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
   "customizations": {
     "vscode": {
       "extensions": [
.gitignore
CHANGED
@@ -1,4 +1,3 @@
 __pycache__
 .vscode
-.venv
-html
+.venv
functions/__init__.py
DELETED
@@ -1,10 +0,0 @@
-"""
-Functions package for the resumate application.
-
-This package contains modules for data acquisition, processing, and analysis
-of LinkedIn profiles, GitHub profiles, and job postings.
-"""
-
-from .context_acquisition import get_linkedin_profile_html
-
-__all__ = ['get_linkedin_profile_html']
functions/context_acquisition.py
CHANGED
@@ -1,210 +1,310 @@
 """
 context_acquisition.py
 
-Functions for acquiring context from various sources including …
-GitHub profiles, and job …
+Functions for acquiring context from various sources including PDF text extraction,
+GitHub profiles, and job posting text.
 """
 
-import …
+import re
 import logging
+import io
 import os
-
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, WebDriverException
+import PyPDF2
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-def get_linkedin_profile_html(profile_url, wait_time=10):
-    """
-    …
-
-    Args:
-        …
-        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)
-
-    Returns:
-        str: The HTML content of the LinkedIn profile page
-
-    Raises:
-        WebDriverException: If there's an issue with the browser automation
-        TimeoutException: If the page takes too long to load
-    """
-    …
-        raise ValueError("Profile URL must be a non-empty string")
-
-    if "linkedin.com/in/" not in profile_url:
-        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
-
-    # Configure Chrome options for headless browsing
-    chrome_options = setup_chrome_driver_options()
-
-    driver = None
-    try:
-        …
-            EC.any_of(
-                EC.presence_of_element_located((  # Profile header
-                    By.CSS_SELECTOR,
-                    ".pv-top-card"
-                )),
-                EC.presence_of_element_located((  # Profile section
-                    By.CSS_SELECTOR,
-                    ".profile-section"
-                )),
-                EC.presence_of_element_located((  # Auth wall
-                    By.CSS_SELECTOR,
-                    ".authwall"
-                )),
-                EC.presence_of_element_located((  # Public profile
-                    By.CSS_SELECTOR,
-                    ".public-profile"
-                )),
-            )
-        )
-
-    except TimeoutException:
-        logger.warning(
-            "Standard LinkedIn elements not found, proceeding with current page state"
-        )
-
-        # Additional wait to ensure dynamic content loads
-        time.sleep(2)
-
-        # Get the page HTML
-        html_content = driver.page_source
-
-        # Clean up HTML by removing blank lines
-        cleaned_html = _clean_html_content(html_content)
-
-        logger.info(
-            "Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
-            len(html_content),
-            len(cleaned_html)
-        )
-
-        # Save HTML content to file
-        _save_html_to_file(cleaned_html, profile_url)
-
-        return cleaned_html
-
-    except WebDriverException as e:
-        logger.error("WebDriver error occurred: %s", str(e))
-        raise WebDriverException(f"Browser automation failed: {str(e)}") from e
-
-    except Exception as e:
-        logger.error("Unexpected error occurred: %s", str(e))
-        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
-
-    finally:
-        # Always clean up the driver
-        if driver:
-            try:
-                …
-
-
-def _clean_html_content(html_content):
-    """
-    …
-
-    Args:
-        …
-
-    Returns:
-        …
-    """
-    …
-
-
-def _save_html_to_file(html_content, profile_url):
-    """
-    …
-
-    Args:
-        …
-        …
-
-    Returns:
-        str: …
-    """
-    …
-    logger.info("HTML content saved to: %s", file_path)
-    return file_path
-
-    …
-    return ""
-
-
-def setup_chrome_driver_options():
-    """
-    …
-
-    Returns:
-        …
-    """
-    …
-        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
-    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-    chrome_options.add_experimental_option('useAutomationExtension', False)
-
-    return chrome_options
+def extract_text_from_linkedin_pdf(pdf_file) -> dict:
+    """
+    Extract and structure text content from an uploaded LinkedIn resume export PDF file
+    for optimal LLM processing.
+
+    Args:
+        pdf_file: The file path string to the uploaded PDF file
+
+    Returns:
+        dict: Dictionary containing extraction status, structured text content, and metadata
+
+    Example:
+        {
+            "status": "success",
+            "structured_text": {
+                "sections": {...},
+                "full_text": "...",
+                "llm_formatted": "...",
+                "summary": "..."
+            },
+            "metadata": {...}
+        }
+    """
+    if pdf_file is None:
+        return {"status": "error", "message": "No PDF file provided"}
+
+    try:
+        # Get filename from path
+        filename = os.path.basename(pdf_file)
+
+        # Read the PDF file from the file path
+        with open(pdf_file, 'rb') as file:
+            file_content = file.read()
+            file_size = len(file_content)
+
+        # Create PDF reader from the file content
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
+
+        # Extract text from all pages
+        extracted_text = ""
+        num_pages = len(pdf_reader.pages)
+
+        for page_num in range(num_pages):
+            try:
+                page = pdf_reader.pages[page_num]
+                page_text = page.extract_text()
+                extracted_text += page_text + "\n\n"
+            except Exception as e:
+                logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
+                continue
+
+        # Clean and structure the extracted text for LLM consumption
+        structured_content = _structure_resume_text(extracted_text)
+
+        if not structured_content["full_text"].strip():
+            return {
+                "status": "warning",
+                "structured_text": structured_content,
+                "metadata": {
+                    "filename": filename,
+                    "file_size": file_size,
+                    "pages": num_pages
+                },
+                "message": "PDF processed but no text content was extracted"
+            }
+
+        logger.info(f"Successfully extracted and structured {len(structured_content['full_text'])} characters from {filename}")
+
+        return {
+            "status": "success",
+            "structured_text": structured_content,
+            "metadata": {
+                "filename": filename,
+                "file_size": file_size,
+                "pages": num_pages,
+                "sections_found": list(structured_content["sections"].keys())
+            },
+            "message": f"Text extracted and structured successfully from {num_pages} pages"
+        }
+
+    except Exception as e:
+        logger.error(f"Error processing PDF file: {str(e)}")
+        return {
+            "status": "error",
+            "message": f"Failed to extract text from PDF: {str(e)}"
+        }
+
+
+def _structure_resume_text(text: str) -> dict:
+    """
+    Structure resume text into logical sections for optimal LLM processing.
+
+    Args:
+        text (str): Raw extracted text from PDF
+
+    Returns:
+        dict: Structured text with sections, full text, and summary
+    """
+    if not text:
+        return {
+            "sections": {},
+            "full_text": "",
+            "llm_formatted": "",
+            "summary": "",
+            "format": "structured_resume",
+            "word_count": 0,
+            "section_count": 0
+        }
+
+    # Clean the text first
+    cleaned_text = _clean_extracted_text(text)
+
+    # Define section patterns (common LinkedIn export sections)
+    section_patterns = {
+        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
+        "summary": r"(?i)(summary|about|overview|profile)",
+        "experience": r"(?i)(experience|work|employment|professional)",
+        "education": r"(?i)(education|academic|university|college|school)",
+        "skills": r"(?i)(skills|competencies|technologies|technical)",
+        "certifications": r"(?i)(certification|certificate|license)",
+        "projects": r"(?i)(project|portfolio)",
+        "achievements": r"(?i)(achievement|award|honor|recognition)",
+        "languages": r"(?i)(language|linguistic)",
+        "volunteer": r"(?i)(volunteer|community|charity)"
+    }
+
+    # Split text into lines for processing
+    lines = cleaned_text.split('\n')
+    sections = {}
+    current_section = "general"
+    current_content = []
+
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if line is a section header
+        section_found = None
+        for section_name, pattern in section_patterns.items():
+            if re.match(pattern, line):
+                section_found = section_name
+                break
+
+        if section_found:
+            # Save previous section content
+            if current_content:
+                sections[current_section] = '\n'.join(current_content)
+
+            # Start new section
+            current_section = section_found
+            current_content = [line]
+        else:
+            current_content.append(line)
+
+    # Save the last section
+    if current_content:
+        sections[current_section] = '\n'.join(current_content)
+
+    # Create a structured summary for LLM context
+    summary_parts = []
+    if "contact_info" in sections:
+        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
+    if "summary" in sections:
+        summary_parts.append(f"SUMMARY: {sections['summary']}")
+    if "experience" in sections:
+        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
+    if "education" in sections:
+        summary_parts.append(f"EDUCATION: {sections['education']}")
+    if "skills" in sections:
+        summary_parts.append(f"SKILLS: {sections['skills']}")
+
+    # Create LLM-optimized format
+    llm_formatted_text = _format_for_llm(sections, cleaned_text)
+
+    return {
+        "sections": sections,
+        "full_text": cleaned_text,
+        "llm_formatted": llm_formatted_text,
+        "summary": '\n\n'.join(summary_parts),
+        "format": "structured_resume",
+        "word_count": len(cleaned_text.split()),
+        "section_count": len(sections)
+    }
+
+
+def _format_for_llm(sections: dict, full_text: str) -> str:
+    """
+    Format the resume sections in an optimal way for LLM processing.
+
+    Args:
+        sections (dict): Structured sections
+        full_text (str): Full cleaned text
+
+    Returns:
+        str: LLM-optimized formatted text
+    """
+    formatted_parts = ["=== RESUME CONTENT ===\n"]
+
+    # Prioritize sections in logical order for LLM
+    priority_order = ["summary", "contact_info", "experience", "education", "skills",
+                      "certifications", "projects", "achievements", "languages", "volunteer"]
+
+    # Add prioritized sections
+    for section_name in priority_order:
+        if section_name in sections:
+            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
+            formatted_parts.append(sections[section_name])
+            formatted_parts.append("")  # Empty line between sections
+
+    # Add any remaining sections
+    for section_name, content in sections.items():
+        if section_name not in priority_order and section_name != "general":
+            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
+            formatted_parts.append(content)
+            formatted_parts.append("")
+
+    # Add general content if exists
+    if "general" in sections:
+        formatted_parts.append("[ADDITIONAL INFORMATION]")
+        formatted_parts.append(sections["general"])
+
+    formatted_parts.append("\n=== END RESUME ===")
+
+    return '\n'.join(formatted_parts)
+
+
+def _clean_extracted_text(text: str) -> str:
+    """
+    Clean and normalize extracted text from PDF for better LLM processing.
+
+    Args:
+        text (str): Raw extracted text
+
+    Returns:
+        str: Cleaned text optimized for LLM consumption
+    """
+    if not text:
+        return ""
+
+    # Remove excessive whitespace and normalize line endings
+    text = re.sub(r'\r\n', '\n', text)
+    text = re.sub(r'\r', '\n', text)
+
+    # Split into lines and clean each line
+    lines = text.split('\n')
+    cleaned_lines = []
+
+    for line in lines:
+        # Strip whitespace
+        cleaned_line = line.strip()
+
+        # Skip empty lines and very short lines (likely artifacts)
+        if len(cleaned_line) < 2:
+            continue
+
+        # Remove common PDF artifacts
+        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
+        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
+
+        if cleaned_line:
+            cleaned_lines.append(cleaned_line)
+
+    # Join lines and normalize spacing
+    cleaned_text = '\n'.join(cleaned_lines)
+
+    # Normalize multiple spaces to single spaces
+    cleaned_text = re.sub(r' +', ' ', cleaned_text)
+
+    # Normalize multiple newlines to maximum of 2
+    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+
+    return cleaned_text.strip()
+
+
+def get_llm_context_from_resume(extraction_result: dict) -> str:
+    """
+    Extract the best formatted text for LLM context from the extraction result.
+
+    Args:
+        extraction_result (dict): Result from extract_text_from_linkedin_pdf
+
+    Returns:
+        str: Formatted text ready for LLM context
+    """
+    if extraction_result.get("status") != "success":
+        return ""
+
+    structured_text = extraction_result.get("structured_text", {})
+
+    # Return the LLM-formatted version if available, otherwise fall back to full text
+    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
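Taken together, the new module is meant to be driven roughly like this (a minimal sketch, not part of the commit; "resume.pdf" is an illustrative path to a local LinkedIn export):

    from functions.context_acquisition import (
        extract_text_from_linkedin_pdf,
        get_llm_context_from_resume,
    )

    # Parse and section a local LinkedIn export PDF
    result = extract_text_from_linkedin_pdf("resume.pdf")

    if result["status"] == "success":
        # e.g. ['summary', 'experience', 'skills', ...]
        print(result["metadata"]["sections_found"])
        # Section-ordered plain text, ready to drop into an LLM prompt
        llm_context = get_llm_context_from_resume(result)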
packages.txt
DELETED
@@ -1 +0,0 @@
-chromium
requirements.txt
CHANGED
@@ -1,3 +1,2 @@
 gradio==5.35.0
-…
-webdriver-manager>=3.8.0
+PyPDF2==3.0.1
resumate.py
CHANGED
@@ -1,10 +1,10 @@
 """
 resumate.py
 
-A simple Gradio UI for collecting user profile and job post …
+A simple Gradio UI for collecting user profile and job post information.
 
-This app provides …
-- LinkedIn …
+This app provides inputs for:
+- LinkedIn resume export PDF file upload
 - GitHub profile URL
 - LinkedIn job post URL
 
@@ -15,39 +15,67 @@ To run:
 """
 
 import gradio as gr
-from functions.context_acquisition import …
+from functions.context_acquisition import extract_text_from_linkedin_pdf, get_llm_context_from_resume
 
 
-def process_inputs(…):
+def process_inputs(linkedin_pdf, github_url, job_post_url):
     """
-    Process the input …
+    Process the input files and URLs.
 
     Args:
-        …
+        linkedin_pdf: Uploaded LinkedIn resume export PDF file
         github_url (str): GitHub profile URL
         job_post_url (str): LinkedIn job post URL
 
     Returns:
-        str: Formatted output with …
+        str: Formatted output with file and URL information
     """
-    result = …
-    …
-    result += "…
-    …
+    result = ""
+
+    # Process LinkedIn PDF file
+    if linkedin_pdf is not None:
+        result += f"✅ LinkedIn Resume PDF uploaded: {linkedin_pdf.name}\n"
+
+        # Extract and structure text from the PDF
+        extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
+
+        if extraction_result["status"] == "success":
+            structured_text = extraction_result["structured_text"]
+            result += "✅ Text extraction successful\n"
+            result += structured_text["llm_formatted"] + "\n"
+        elif extraction_result["status"] == "warning":
+            result += f"⚠️ Text extraction: {extraction_result['message']}\n\n"
+        else:
+            result += f"❌ Text extraction failed: {extraction_result['message']}\n\n"
+    else:
+        result += "❌ No LinkedIn resume PDF file uploaded\n\n"
+
+    # Process other inputs
+    result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
+    result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"
 
     return result
 
 with gr.Blocks() as demo:
     gr.Markdown("# Resumate: Profile & Job Post Input")
-    …
+
+    gr.Markdown("""
+    ## How to Export Your LinkedIn Profile as PDF
+
+    1. **Go to your LinkedIn profile page** (linkedin.com/in/your-profile)
+    2. **Click "More" button** (three dots) in your profile header section
+    3. **Select "Save to PDF"** from the dropdown menu
+    4. **Wait for the download** - LinkedIn will generate and download your profile as a PDF file
+    5. **Upload the downloaded PDF** using the file upload box below
+
+    💡 **Tip**: Make sure your LinkedIn profile is complete and up-to-date before exporting for best results!
+    """)
+
+    linkedin_pdf = gr.File(
+        label="LinkedIn Resume Export PDF",
+        file_types=[".pdf"],
+        file_count="single"
     )
 
     github_profile = gr.Textbox(
@@ -61,11 +89,11 @@ with gr.Blocks() as demo:
     )
 
     submit_btn = gr.Button("Submit")
-    output = gr.Textbox(label="Output", lines=…)
+    output = gr.Textbox(label="Output", lines=20, max_lines=50, show_copy_button=True)
 
     submit_btn.click(  # pylint: disable=no-member
         process_inputs,
-        inputs=[…],
+        inputs=[linkedin_pdf, github_profile, job_post],
         outputs=output
     )
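One detail of the handler worth noting: it reads linkedin_pdf.name, treating the upload as an object whose .name attribute is the on-disk path of the temporary copy Gradio writes, and forwards that path to the path-based extractor rather than reading bytes itself. That contract also makes the handler easy to exercise without launching the UI (a sketch; SimpleNamespace stands in for the upload object, the PDF path is illustrative, and it assumes importing resumate does not auto-launch the app):

    from types import SimpleNamespace

    from resumate import process_inputs

    # Anything exposing .name that points at a real PDF mimics a Gradio upload
    stub_upload = SimpleNamespace(name="sample_resume.pdf")

    print(process_inputs(
        stub_upload,
        "https://github.com/example-user",       # illustrative
        "https://www.linkedin.com/jobs/view/1",  # illustrative
    ))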
tests/test_context_acquisition.py
CHANGED
@@ -1,252 +1,3 @@
 """
 Unit tests for the context_acquisition module.
 """
-
-import unittest
-import os
-import tempfile
-import shutil
-from selenium.webdriver.chrome.options import Options
-
-import functions.context_acquisition
-
-# Import the functions to test
-from functions.context_acquisition import (
-    _clean_html_content,
-    _save_html_to_file,
-    setup_chrome_driver_options
-)
-
-
-class TestCleanHTMLContent(unittest.TestCase):
-    """Test cases for the _clean_html_content function."""
-
-    def test_remove_blank_lines(self):
-        """Test removal of blank lines from HTML content."""
-        html_with_blanks = """<html>
-
-<head>
-<title>Test</title>
-
-</head>
-
-<body>
-<div>Content</div>
-
-</body>
-</html>"""
-
-        expected = """<html>
-<head>
-<title>Test</title>
-</head>
-<body>
-<div>Content</div>
-</body>
-</html>"""
-
-        result = _clean_html_content(html_with_blanks)
-        self.assertEqual(result, expected)
-
-    def test_strip_trailing_whitespace(self):
-        """Test removal of trailing whitespace from lines."""
-        html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
-        expected = "<div>Content</div>\n<p>Text</p>"
-
-        result = _clean_html_content(html_with_trailing)
-        self.assertEqual(result, expected)
-
-    def test_empty_content(self):
-        """Test handling of empty or whitespace-only content."""
-        self.assertEqual(_clean_html_content(""), "")
-        self.assertEqual(_clean_html_content(" \n\n\t "), "")
-        self.assertEqual(_clean_html_content("\n"), "")
-
-    def test_single_line_content(self):
-        """Test cleaning of single line content."""
-        single_line = "<html><body>Content</body></html>"
-        result = _clean_html_content(single_line)
-        self.assertEqual(result, single_line)
-
-    def test_mixed_whitespace(self):
-        """Test handling of mixed whitespace characters."""
-        mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
-        expected = "<div>\n<p>Text</p>\n</div>"
-        result = _clean_html_content(mixed)
-        self.assertEqual(result, expected)
-
-
-class TestSaveHTMLToFile(unittest.TestCase):
-    """Test cases for the _save_html_to_file function."""
-
-    def setUp(self):
-        """Set up test fixtures with temporary directory."""
-        self.test_dir = tempfile.mkdtemp()
-        self.test_html = "<html><body>Test content</body></html>"
-        self.test_url = "https://www.linkedin.com/in/johndoe"
-
-    def tearDown(self):
-        """Clean up temporary directory."""
-        if os.path.exists(self.test_dir):
-            shutil.rmtree(self.test_dir)
-
-    def test_successful_file_save(self):
-        """Test successful saving of HTML content to file."""
-        # Temporarily change the file path calculation
-        original_dirname = os.path.dirname
-
-        def mock_dirname(path):
-            if path.endswith('context_acquisition.py'):
-                return self.test_dir
-            return original_dirname(path)
-
-        # Replace os.path.dirname temporarily
-        original_func = functions.context_acquisition.os.path.dirname
-        functions.context_acquisition.os.path.dirname = mock_dirname
-
-        try:
-            result = _save_html_to_file(self.test_html, self.test_url)
-
-            # Verify file was created
-            self.assertTrue(os.path.exists(result))
-            self.assertTrue(result.endswith('.html'))
-
-            # Verify file content
-            with open(result, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self.assertEqual(content, self.test_html)
-
-        finally:
-            # Restore original function
-            functions.context_acquisition.os.path.dirname = original_func
-
-
-class TestSetupChromeDriverOptions(unittest.TestCase):
-    """Test cases for the setup_chrome_driver_options function."""
-
-    def test_chrome_options_configuration(self):
-        """Test that Chrome options are properly configured."""
-        options = setup_chrome_driver_options()
-
-        # Verify that options object is returned
-        self.assertIsNotNone(options)
-
-        # Verify it's the correct type
-        self.assertIsInstance(options, Options)
-
-    def test_chrome_options_arguments(self):
-        """Test that required Chrome arguments are set."""
-        options = setup_chrome_driver_options()
-
-        # Access the arguments (this is implementation dependent)
-        # Note: This test verifies the function runs without error
-        # Specific argument verification would require accessing private attributes
-        self.assertIsNotNone(options)
-
-
-class TestURLValidation(unittest.TestCase):
-    """Test cases for URL validation logic (extracted from main function)."""
-
-    def test_valid_linkedin_urls(self):
-        """Test validation of valid LinkedIn URLs."""
-        valid_urls = [
-            "https://www.linkedin.com/in/johndoe",
-            "https://linkedin.com/in/jane-smith",
-            "http://www.linkedin.com/in/test123",
-            "https://www.linkedin.com/in/user-name-with-dashes",
-        ]
-
-        for url in valid_urls:
-            # Test the validation logic directly
-            self.assertTrue(isinstance(url, str))
-            self.assertTrue(url.strip())
-            self.assertIn("linkedin.com/in/", url)
-
-    def test_invalid_linkedin_urls(self):
-        """Test validation of invalid LinkedIn URLs."""
-        invalid_urls = [
-            "",
-            None,
-            "https://www.example.com/profile",
-            "https://www.linkedin.com/company/test",
-            "https://github.com/user",
-            "not-a-url",
-        ]
-
-        for url in invalid_urls:
-            # Test the validation logic directly
-            if url is None or not isinstance(url, str):
-                self.assertTrue(url is None or not isinstance(url, str))
-            elif not url.strip():
-                self.assertFalse(url.strip())
-            else:
-                self.assertNotIn("linkedin.com/in/", url)
-
-
-class TestHTMLContentProcessing(unittest.TestCase):
-    """Test cases for HTML content processing workflows."""
-
-    def test_html_cleaning_workflow(self):
-        """Test the complete HTML cleaning workflow."""
-        raw_html = """<!DOCTYPE html>
-<html>
-
-<head>
-<title>LinkedIn Profile</title>
-
-</head>
-
-<body>
-<div class="profile">
-<h1>John Doe</h1>
-
-<p>Software Engineer</p>
-</div>
-
-</body>
-
-</html>"""
-
-        cleaned = _clean_html_content(raw_html)
-
-        # Verify no empty lines
-        lines = cleaned.split('\n')
-        for line in lines:
-            self.assertTrue(line.strip(), f"Found empty line: '{line}'")
-
-        # Verify content is preserved
-        self.assertIn("John Doe", cleaned)
-        self.assertIn("Software Engineer", cleaned)
-        self.assertIn("LinkedIn Profile", cleaned)
-
-    def test_minimal_html_cleaning(self):
-        """Test cleaning of minimal HTML content."""
-        minimal_html = "<html><body>Content</body></html>"
-        result = _clean_html_content(minimal_html)
-        self.assertEqual(result, minimal_html)
-
-    def test_complex_whitespace_patterns(self):
-        """Test cleaning of complex whitespace patterns."""
-        complex_html = """<div>
-\t\t
-<span>Text</span>
-\t
-
-<p>Paragraph</p>
-\t
-</div>"""
-
-        result = _clean_html_content(complex_html)
-        lines = result.split('\n')
-
-        # Should have no empty lines
-        for line in lines:
-            self.assertTrue(line.strip())
-
-        # Should preserve content
-        self.assertIn("Text", result)
-        self.assertIn("Paragraph", result)
-
-
-if __name__ == '__main__':
-    unittest.main()
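With the Selenium helpers deleted, this commit leaves the test module as a bare docstring. A natural follow-up would be coverage for the new PDF helpers; a minimal sketch (not part of this commit) against _clean_extracted_text:

    import unittest

    from functions.context_acquisition import _clean_extracted_text


    class TestCleanExtractedText(unittest.TestCase):
        """Sketch tests for the new PDF text cleaner."""

        def test_removes_page_number_and_separator_artifacts(self):
            raw = "Experience\n3\n----\nSoftware Engineer"
            cleaned = _clean_extracted_text(raw)
            # Lone page numbers and separator rows are dropped
            self.assertEqual(cleaned, "Experience\nSoftware Engineer")

        def test_normalizes_whitespace(self):
            raw = "Skills:   Python\r\n\r\n\r\nGo"
            cleaned = _clean_extracted_text(raw)
            # Runs of spaces collapse; CR/LF variants become plain newlines
            self.assertEqual(cleaned, "Skills: Python\nGo")


    if __name__ == "__main__":
        unittest.main()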