gperdrizet committed on
Commit
cbb592a
·
2 Parent(s): 1225f1c 3e6ea2e

Fixed merge conflict

Browse files
.devcontainer/devcontainer.json CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "name": "Python 3.10: resumate",
5
  "image": "mcr.microsoft.com/devcontainers/python:0-3.11",
6
- "onCreateCommand": "sudo apt update && sudo apt upgrade -y && sudo apt install -y chromium && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
7
  "customizations": {
8
  "vscode": {
9
  "extensions": [
 
3
  {
4
  "name": "Python 3.10: resumate",
5
  "image": "mcr.microsoft.com/devcontainers/python:0-3.11",
6
+ "onCreateCommand": "sudo apt update && sudo apt upgrade -y && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
7
  "customizations": {
8
  "vscode": {
9
  "extensions": [
.gitignore CHANGED
@@ -1,4 +1,3 @@
1
  __pycache__
2
  .vscode
3
- .venv
4
- html
 
1
  __pycache__
2
  .vscode
3
+ .venv
 
functions/__init__.py DELETED
@@ -1,10 +0,0 @@
1
- """
2
- Functions package for the resumate application.
3
-
4
- This package contains modules for data acquisition, processing, and analysis
5
- of LinkedIn profiles, GitHub profiles, and job postings.
6
- """
7
-
8
- from .context_acquisition import get_linkedin_profile_html
9
-
10
- __all__ = ['get_linkedin_profile_html']
 
 
 
 
 
 
 
 
 
 
 
functions/context_acquisition.py CHANGED
@@ -1,210 +1,310 @@
1
  """
2
  context_acquisition.py
3
 
4
- Functions for acquiring context from various sources including LinkedIn profiles,
5
- GitHub profiles, and job postings using browser automation.
6
  """
7
 
8
- import time
9
  import logging
 
10
  import os
11
- from urllib.parse import urlparse
12
-
13
- from selenium import webdriver
14
- from selenium.webdriver.chrome.options import Options
15
- from selenium.webdriver.common.by import By
16
- from selenium.webdriver.support.ui import WebDriverWait
17
- from selenium.webdriver.support import expected_conditions as EC
18
- from selenium.common.exceptions import TimeoutException, WebDriverException
19
 
20
  # Set up logging
21
  logging.basicConfig(level=logging.INFO)
22
  logger = logging.getLogger(__name__)
23
 
24
 
25
- def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
26
  """
27
- Retrieve the HTML content of a LinkedIn profile using browser automation.
28
- The HTML content is saved to the html directory and also returned.
29
 
30
  Args:
31
- profile_url (str): The URL of the LinkedIn profile to scrape
32
- wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)
33
-
34
- Returns:
35
- str: The HTML content of the LinkedIn profile page
36
 
37
- Raises:
38
- ValueError: If the URL is not a valid LinkedIn profile URL
39
- WebDriverException: If there's an issue with the browser automation
40
- TimeoutException: If the page takes too long to load
41
 
42
- Note:
43
- The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
 
 
 
 
 
 
 
 
 
44
  """
45
-
46
- # Validate LinkedIn URL
47
- if not profile_url or not isinstance(profile_url, str):
48
- raise ValueError("Profile URL must be a non-empty string")
49
-
50
- if "linkedin.com/in/" not in profile_url:
51
- raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
52
-
53
- # Configure Chrome options for headless browsing
54
- chrome_options = setup_chrome_driver_options()
55
-
56
- driver = None
57
  try:
58
- # Initialize the Chrome driver
59
- logger.info("Initializing browser for URL: %s", profile_url)
60
- driver = webdriver.Chrome(options=chrome_options)
61
- driver.set_page_load_timeout(30)
62
-
63
- # Navigate to the LinkedIn profile
64
- logger.info("Navigating to LinkedIn profile...")
65
- driver.get(profile_url)
66
-
67
- # Wait for the page to load
68
- # Look for common LinkedIn profile elements
69
- wait = WebDriverWait(driver, wait_time)
70
-
71
- try:
72
- # Wait for either the main content or login prompt
73
- wait.until(
74
- EC.any_of(
75
- EC.presence_of_element_located(( # Profile header
76
- By.CSS_SELECTOR,
77
- ".pv-top-card"
78
- )),
79
- EC.presence_of_element_located(( # Profile section
80
- By.CSS_SELECTOR,
81
- ".profile-section"
82
- )),
83
- EC.presence_of_element_located(( # Auth wall
84
- By.CSS_SELECTOR,
85
- ".authwall"
86
- )),
87
- EC.presence_of_element_located(( # Public profile
88
- By.CSS_SELECTOR,
89
- ".public-profile"
90
- )),
91
- )
92
- )
93
-
94
- except TimeoutException:
95
- logger.warning(
96
- "Standard LinkedIn elements not found, proceeding with current page state"
97
- )
98
-
99
- # Additional wait to ensure dynamic content loads
100
- time.sleep(2)
101
-
102
- # Get the page HTML
103
- html_content = driver.page_source
104
-
105
- # Clean up HTML by removing blank lines
106
- cleaned_html = _clean_html_content(html_content)
107
-
108
- logger.info(
109
- "Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
110
- len(html_content),
111
- len(cleaned_html)
112
- )
113
-
114
- # Save HTML content to file
115
- _save_html_to_file(cleaned_html, profile_url)
116
-
117
- return cleaned_html
118
-
119
- except WebDriverException as e:
120
- logger.error("WebDriver error occurred: %s", str(e))
121
- raise WebDriverException(f"Browser automation failed: {str(e)}") from e
122
-
123
- except Exception as e:
124
- logger.error("Unexpected error occurred: %s", str(e))
125
- raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
126
-
127
- finally:
128
- # Always clean up the driver
129
- if driver:
130
  try:
131
- driver.quit()
132
- logger.info("Browser session closed")
133
- except WebDriverException as e:
134
- logger.warning("Error closing browser: %s", str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
- def _clean_html_content(html_content: str) -> str:
138
  """
139
- Clean HTML content by removing blank lines and excessive whitespace.
140
 
141
  Args:
142
- html_content (str): The raw HTML content to clean
143
 
144
  Returns:
145
- str: Cleaned HTML content with blank lines removed
146
  """
147
- # Split into lines, strip whitespace, and filter out empty lines
148
- lines = html_content.split('\n')
149
- cleaned_lines = [line.rstrip() for line in lines if line.strip()]
150
-
151
- # Join back together with single newlines
152
- return '\n'.join(cleaned_lines)
153
-
154
-
155
- def _save_html_to_file(html_content: str, profile_url: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  """
157
- Save HTML content to a file in the html directory.
158
 
159
  Args:
160
- html_content (str): The HTML content to save
161
- profile_url (str): The original profile URL for filename generation
162
 
163
  Returns:
164
- str: The path to the saved file
165
  """
166
- try:
167
- # Create html directory if it doesn't exist
168
- html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
169
- os.makedirs(html_dir, exist_ok=True)
170
-
171
- # Generate filename from URL and timestamp
172
- parsed_url = urlparse(profile_url)
173
- profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
174
- filename = f"linkedin_profile_{profile_name}.html"
175
-
176
- # Full file path
177
- file_path = os.path.join(html_dir, filename)
178
-
179
- # Save HTML content
180
- with open(file_path, 'w', encoding='utf-8') as f:
181
- f.write(html_content)
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- logger.info("HTML content saved to: %s", file_path)
184
- return file_path
185
 
186
- except Exception as e: # pylint: disable=broad-exception-caught
187
- logger.warning("Failed to save HTML content: %s", str(e))
 
 
 
 
 
 
 
 
 
188
  return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
 
191
- def setup_chrome_driver_options() -> Options:
192
  """
193
- Create and configure Chrome driver options for web scraping.
194
 
 
 
 
195
  Returns:
196
- Options: Configured Chrome options object
197
  """
198
- chrome_options = Options()
199
- chrome_options.add_argument("--headless") # Run in background
200
- chrome_options.add_argument("--no-sandbox")
201
- chrome_options.add_argument("--disable-dev-shm-usage")
202
- chrome_options.add_argument("--disable-gpu")
203
- chrome_options.add_argument("--window-size=1920,1080")
204
- chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
205
- "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
206
- chrome_options.add_argument("--disable-blink-features=AutomationControlled")
207
- chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
208
- chrome_options.add_experimental_option('useAutomationExtension', False)
209
-
210
- return chrome_options
 
1
  """
2
  context_acquisition.py
3
 
4
+ Functions for acquiring context from various sources including PDF text extraction,
5
+ GitHub profiles, and job posting text.
6
  """
7
 
8
+ import re
9
  import logging
10
+ import io
11
  import os
12
+ import PyPDF2
 
 
 
 
 
 
 
13
 
14
  # Set up logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
 
19
def extract_text_from_linkedin_pdf(pdf_file) -> dict:
    """
    Extract and structure text content from an uploaded LinkedIn resume export PDF file
    for optimal LLM processing.

    Args:
        pdf_file: The file path string to the uploaded PDF file

    Returns:
        dict: Dictionary containing extraction status, structured text content, and metadata.
            "status" is one of "success", "warning" (PDF was read but yielded no text)
            or "error" (no file given, or the file could not be read/parsed).

    Example:
        {
            "status": "success",
            "structured_text": {
                "sections": {...},
                "full_text": "...",
                "llm_formatted": "...",
                "summary": "..."
            },
            "metadata": {...}
        }
    """
    # Treat None and an empty path alike: there is nothing to open.
    if not pdf_file:
        return {"status": "error", "message": "No PDF file provided"}

    try:
        # Get filename from path
        filename = os.path.basename(pdf_file)

        # Read the whole file up front so the reader works on an in-memory copy
        with open(pdf_file, 'rb') as file:
            file_content = file.read()
        file_size = len(file_content)

        # Create PDF reader from the file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        num_pages = len(pdf_reader.pages)

        # Extract text page by page. The page lookup stays inside the try so that a
        # single broken page is skipped instead of aborting the whole extraction.
        page_texts = []
        for page_num in range(num_pages):
            try:
                page_text = pdf_reader.pages[page_num].extract_text()
            except Exception as e:  # pylint: disable=broad-exception-caught
                logger.warning("Error extracting text from page %d: %s", page_num + 1, e)
                continue
            # Guard against extractors that hand back None for image-only pages
            page_texts.append((page_text or "") + "\n\n")

        extracted_text = "".join(page_texts)

        # Clean and structure the extracted text for LLM consumption
        structured_content = _structure_resume_text(extracted_text)

        # Same metadata shape for both the warning and the success result
        metadata = {
            "filename": filename,
            "file_size": file_size,
            "pages": num_pages,
            "sections_found": list(structured_content["sections"].keys())
        }

        if not structured_content["full_text"].strip():
            return {
                "status": "warning",
                "structured_text": structured_content,
                "metadata": metadata,
                "message": "PDF processed but no text content was extracted"
            }

        logger.info(
            "Successfully extracted and structured %d characters from %s",
            len(structured_content["full_text"]),
            filename
        )

        return {
            "status": "success",
            "structured_text": structured_content,
            "metadata": metadata,
            "message": f"Text extracted and structured successfully from {num_pages} pages"
        }

    except Exception as e:  # pylint: disable=broad-exception-caught
        # Boundary handler: callers expect a status dict, never an exception
        logger.error("Error processing PDF file: %s", e)
        return {
            "status": "error",
            "message": f"Failed to extract text from PDF: {str(e)}"
        }
105
 
106
 
107
+ def _structure_resume_text(text: str) -> dict:
108
  """
109
+ Structure resume text into logical sections for optimal LLM processing.
110
 
111
  Args:
112
+ text (str): Raw extracted text from PDF
113
 
114
  Returns:
115
+ dict: Structured text with sections, full text, and summary
116
  """
117
+ if not text:
118
+ return {
119
+ "sections": {},
120
+ "full_text": "",
121
+ "llm_formatted": "",
122
+ "summary": "",
123
+ "format": "structured_resume",
124
+ "word_count": 0,
125
+ "section_count": 0
126
+ }
127
+
128
+ # Clean the text first
129
+ cleaned_text = _clean_extracted_text(text)
130
+
131
+ # Define section patterns (common LinkedIn export sections)
132
+ section_patterns = {
133
+ "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
134
+ "summary": r"(?i)(summary|about|overview|profile)",
135
+ "experience": r"(?i)(experience|work|employment|professional)",
136
+ "education": r"(?i)(education|academic|university|college|school)",
137
+ "skills": r"(?i)(skills|competencies|technologies|technical)",
138
+ "certifications": r"(?i)(certification|certificate|license)",
139
+ "projects": r"(?i)(project|portfolio)",
140
+ "achievements": r"(?i)(achievement|award|honor|recognition)",
141
+ "languages": r"(?i)(language|linguistic)",
142
+ "volunteer": r"(?i)(volunteer|community|charity)"
143
+ }
144
+
145
+ # Split text into lines for processing
146
+ lines = cleaned_text.split('\n')
147
+ sections = {}
148
+ current_section = "general"
149
+ current_content = []
150
+
151
+ for line in lines:
152
+ line = line.strip()
153
+ if not line:
154
+ continue
155
+
156
+ # Check if line is a section header
157
+ section_found = None
158
+ for section_name, pattern in section_patterns.items():
159
+ if re.match(pattern, line):
160
+ section_found = section_name
161
+ break
162
+
163
+ if section_found:
164
+ # Save previous section content
165
+ if current_content:
166
+ sections[current_section] = '\n'.join(current_content)
167
+
168
+ # Start new section
169
+ current_section = section_found
170
+ current_content = [line]
171
+ else:
172
+ current_content.append(line)
173
+
174
+ # Save the last section
175
+ if current_content:
176
+ sections[current_section] = '\n'.join(current_content)
177
+
178
+ # Create a structured summary for LLM context
179
+ summary_parts = []
180
+ if "contact_info" in sections:
181
+ summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
182
+ if "summary" in sections:
183
+ summary_parts.append(f"SUMMARY: {sections['summary']}")
184
+ if "experience" in sections:
185
+ summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
186
+ if "education" in sections:
187
+ summary_parts.append(f"EDUCATION: {sections['education']}")
188
+ if "skills" in sections:
189
+ summary_parts.append(f"SKILLS: {sections['skills']}")
190
+
191
+ # Create LLM-optimized format
192
+ llm_formatted_text = _format_for_llm(sections, cleaned_text)
193
+
194
+ return {
195
+ "sections": sections,
196
+ "full_text": cleaned_text,
197
+ "llm_formatted": llm_formatted_text,
198
+ "summary": '\n\n'.join(summary_parts),
199
+ "format": "structured_resume",
200
+ "word_count": len(cleaned_text.split()),
201
+ "section_count": len(sections)
202
+ }
203
+
204
+
205
+ def _format_for_llm(sections: dict, full_text: str) -> str:
206
  """
207
+ Format the resume sections in an optimal way for LLM processing.
208
 
209
  Args:
210
+ sections (dict): Structured sections
211
+ full_text (str): Full cleaned text
212
 
213
  Returns:
214
+ str: LLM-optimized formatted text
215
  """
216
+ formatted_parts = ["=== RESUME CONTENT ===\n"]
217
+
218
+ # Prioritize sections in logical order for LLM
219
+ priority_order = ["summary", "contact_info", "experience", "education", "skills",
220
+ "certifications", "projects", "achievements", "languages", "volunteer"]
221
+
222
+ # Add prioritized sections
223
+ for section_name in priority_order:
224
+ if section_name in sections:
225
+ formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
226
+ formatted_parts.append(sections[section_name])
227
+ formatted_parts.append("") # Empty line between sections
228
+
229
+ # Add any remaining sections
230
+ for section_name, content in sections.items():
231
+ if section_name not in priority_order and section_name != "general":
232
+ formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
233
+ formatted_parts.append(content)
234
+ formatted_parts.append("")
235
+
236
+ # Add general content if exists
237
+ if "general" in sections:
238
+ formatted_parts.append("[ADDITIONAL INFORMATION]")
239
+ formatted_parts.append(sections["general"])
240
+
241
+ formatted_parts.append("\n=== END RESUME ===")
242
+
243
+ return '\n'.join(formatted_parts)
244
 
 
 
245
 
246
+ def _clean_extracted_text(text: str) -> str:
247
+ """
248
+ Clean and normalize extracted text from PDF for better LLM processing.
249
+
250
+ Args:
251
+ text (str): Raw extracted text
252
+
253
+ Returns:
254
+ str: Cleaned text optimized for LLM consumption
255
+ """
256
+ if not text:
257
  return ""
258
+
259
+ # Remove excessive whitespace and normalize line endings
260
+ text = re.sub(r'\r\n', '\n', text)
261
+ text = re.sub(r'\r', '\n', text)
262
+
263
+ # Split into lines and clean each line
264
+ lines = text.split('\n')
265
+ cleaned_lines = []
266
+
267
+ for line in lines:
268
+ # Strip whitespace
269
+ cleaned_line = line.strip()
270
+
271
+ # Skip empty lines and very short lines (likely artifacts)
272
+ if len(cleaned_line) < 2:
273
+ continue
274
+
275
+ # Remove common PDF artifacts
276
+ cleaned_line = re.sub(r'^\d+$', '', cleaned_line) # Page numbers
277
+ cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line) # Separator lines
278
+
279
+ if cleaned_line:
280
+ cleaned_lines.append(cleaned_line)
281
+
282
+ # Join lines and normalize spacing
283
+ cleaned_text = '\n'.join(cleaned_lines)
284
+
285
+ # Normalize multiple spaces to single spaces
286
+ cleaned_text = re.sub(r' +', ' ', cleaned_text)
287
+
288
+ # Normalize multiple newlines to maximum of 2
289
+ cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
290
+
291
+ return cleaned_text.strip()
292
 
293
 
294
def get_llm_context_from_resume(extraction_result: dict) -> str:
    """
    Extract the best formatted text for LLM context from the extraction result.

    Args:
        extraction_result (dict): Result from extract_text_from_linkedin_pdf

    Returns:
        str: Formatted text ready for LLM context. Empty string when the result
            is missing, is not a successful extraction, or carries no text.
    """
    # Anything that is not a successful result dict yields no context
    if not isinstance(extraction_result, dict) or extraction_result.get("status") != "success":
        return ""

    structured_text = extraction_result.get("structured_text") or {}

    # Prefer the LLM-formatted version; fall back to the full text when it is
    # missing or empty (a plain dict.get default would not cover the empty case)
    return structured_text.get("llm_formatted") or structured_text.get("full_text") or ""
 
 
 
 
 
 
packages.txt DELETED
@@ -1 +0,0 @@
1
- chromium
 
 
requirements.txt CHANGED
@@ -1,3 +1,2 @@
1
  gradio==5.35.0
2
- selenium>=4.0.0
3
- webdriver-manager>=3.8.0
 
1
  gradio==5.35.0
2
+ PyPDF2==3.0.1
 
resumate.py CHANGED
@@ -1,10 +1,10 @@
1
  """
2
  resumate.py
3
 
4
- A simple Gradio UI for collecting user profile and job post URLs.
5
 
6
- This app provides three text input fields for:
7
- - LinkedIn profile URL
8
  - GitHub profile URL
9
  - LinkedIn job post URL
10
 
@@ -15,39 +15,67 @@ To run:
15
  """
16
 
17
  import gradio as gr
18
- from functions.context_acquisition import get_linkedin_profile_html
19
 
20
 
21
- def process_inputs(linkedin_url, github_url, job_post_url):
22
  """
23
- Process the input URLs and retrieve content from LinkedIn profile.
24
 
25
  Args:
26
- linkedin_url (str): LinkedIn profile URL
27
  github_url (str): GitHub profile URL
28
  job_post_url (str): LinkedIn job post URL
29
 
30
  Returns:
31
- str: Formatted output with URL information and LinkedIn profile status
32
  """
33
- result = f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # Try to retrieve LinkedIn profile HTML if URL is provided
36
- if linkedin_url and linkedin_url.strip():
37
- try:
38
- result += "Attempting to retrieve LinkedIn profile...\n"
39
- html_content = get_linkedin_profile_html(linkedin_url)
40
- result += f"LinkedIn profile HTML ({len(html_content)} characters)\n"
41
- except Exception as e: # pylint: disable=broad-exception-caught
42
- result += f"❌ Failed to retrieve LinkedIn profile: {str(e)}\n"
 
 
43
 
44
  return result
45
 
46
  with gr.Blocks() as demo:
47
  gr.Markdown("# Resumate: Profile & Job Post Input")
48
- linkedin_profile = gr.Textbox(
49
- label="LinkedIn Profile URL",
50
- placeholder="Enter your LinkedIn profile URL"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  )
52
 
53
  github_profile = gr.Textbox(
@@ -61,11 +89,11 @@ with gr.Blocks() as demo:
61
  )
62
 
63
  submit_btn = gr.Button("Submit")
64
- output = gr.Textbox(label="Output", lines=3)
65
 
66
  submit_btn.click( # pylint: disable=no-member
67
  process_inputs,
68
- inputs=[linkedin_profile, github_profile, job_post],
69
  outputs=output
70
  )
71
 
 
1
  """
2
  resumate.py
3
 
4
+ A simple Gradio UI for collecting user profile and job post information.
5
 
6
+ This app provides inputs for:
7
+ - LinkedIn resume export PDF file upload
8
  - GitHub profile URL
9
  - LinkedIn job post URL
10
 
 
15
  """
16
 
17
  import gradio as gr
18
+ from functions.context_acquisition import extract_text_from_linkedin_pdf, get_llm_context_from_resume
19
 
20
 
21
def process_inputs(linkedin_pdf, github_url, job_post_url):
    """
    Process the input files and URLs.

    Args:
        linkedin_pdf: Uploaded LinkedIn resume export PDF file; either an object
            exposing the file path as ``.name`` or the path string itself
        github_url (str): GitHub profile URL
        job_post_url (str): LinkedIn job post URL

    Returns:
        str: Formatted output with file and URL information
    """
    result = ""

    # Process LinkedIn PDF file
    if linkedin_pdf is not None:
        # Accept both an upload object with a .name path and a bare path string
        pdf_path = getattr(linkedin_pdf, "name", linkedin_pdf)
        result += f"✅ LinkedIn Resume PDF uploaded: {pdf_path}\n"

        # Extract and structure text from the PDF
        extraction_result = extract_text_from_linkedin_pdf(pdf_path)
        status = extraction_result["status"]

        if status == "success":
            structured_text = extraction_result["structured_text"]
            result += "✅ Text extraction successful\n"
            result += structured_text["llm_formatted"] + "\n"

        elif status == "warning":
            result += f"⚠️ Text extraction: {extraction_result['message']}\n\n"
        else:
            # Failure line carries the same status marker as the other branches
            result += f"❌ Text extraction failed: {extraction_result['message']}\n\n"
    else:
        result += "❌ No LinkedIn resume PDF file uploaded\n\n"

    # Process other inputs
    result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
    result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"

    return result
59
 
60
  with gr.Blocks() as demo:
61
  gr.Markdown("# Resumate: Profile & Job Post Input")
62
+
63
+ gr.Markdown("""
64
+ ## How to Export Your LinkedIn Profile as PDF
65
+
66
+ 1. **Go to your LinkedIn profile page** (linkedin.com/in/your-profile)
67
+ 2. **Click "More" button** (three dots) in your profile header section
68
+ 3. **Select "Save to PDF"** from the dropdown menu
69
+ 4. **Wait for the download** - LinkedIn will generate and download your profile as a PDF file
70
+ 5. **Upload the downloaded PDF** using the file upload box below
71
+
72
+ 💡 **Tip**: Make sure your LinkedIn profile is complete and up-to-date before exporting for best results!
73
+ """)
74
+
75
+ linkedin_pdf = gr.File(
76
+ label="LinkedIn Resume Export PDF",
77
+ file_types=[".pdf"],
78
+ file_count="single"
79
  )
80
 
81
  github_profile = gr.Textbox(
 
89
  )
90
 
91
  submit_btn = gr.Button("Submit")
92
+ output = gr.Textbox(label="Output", lines=20, max_lines=50, show_copy_button=True)
93
 
94
  submit_btn.click( # pylint: disable=no-member
95
  process_inputs,
96
+ inputs=[linkedin_pdf, github_profile, job_post],
97
  outputs=output
98
  )
99
 
tests/test_context_acquisition.py CHANGED
@@ -1,252 +1,3 @@
1
  """
2
  Unit tests for the context_acquisition module.
3
  """
4
-
5
- import unittest
6
- import os
7
- import tempfile
8
- import shutil
9
- from selenium.webdriver.chrome.options import Options
10
-
11
- import functions.context_acquisition
12
-
13
- # Import the functions to test
14
- from functions.context_acquisition import (
15
- _clean_html_content,
16
- _save_html_to_file,
17
- setup_chrome_driver_options
18
- )
19
-
20
-
21
- class TestCleanHTMLContent(unittest.TestCase):
22
- """Test cases for the _clean_html_content function."""
23
-
24
- def test_remove_blank_lines(self):
25
- """Test removal of blank lines from HTML content."""
26
- html_with_blanks = """<html>
27
-
28
- <head>
29
- <title>Test</title>
30
-
31
- </head>
32
-
33
- <body>
34
- <div>Content</div>
35
-
36
- </body>
37
- </html>"""
38
-
39
- expected = """<html>
40
- <head>
41
- <title>Test</title>
42
- </head>
43
- <body>
44
- <div>Content</div>
45
- </body>
46
- </html>"""
47
-
48
- result = _clean_html_content(html_with_blanks)
49
- self.assertEqual(result, expected)
50
-
51
- def test_strip_trailing_whitespace(self):
52
- """Test removal of trailing whitespace from lines."""
53
- html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
54
- expected = "<div>Content</div>\n<p>Text</p>"
55
-
56
- result = _clean_html_content(html_with_trailing)
57
- self.assertEqual(result, expected)
58
-
59
- def test_empty_content(self):
60
- """Test handling of empty or whitespace-only content."""
61
- self.assertEqual(_clean_html_content(""), "")
62
- self.assertEqual(_clean_html_content(" \n\n\t "), "")
63
- self.assertEqual(_clean_html_content("\n"), "")
64
-
65
- def test_single_line_content(self):
66
- """Test cleaning of single line content."""
67
- single_line = "<html><body>Content</body></html>"
68
- result = _clean_html_content(single_line)
69
- self.assertEqual(result, single_line)
70
-
71
- def test_mixed_whitespace(self):
72
- """Test handling of mixed whitespace characters."""
73
- mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
74
- expected = "<div>\n<p>Text</p>\n</div>"
75
- result = _clean_html_content(mixed)
76
- self.assertEqual(result, expected)
77
-
78
-
79
- class TestSaveHTMLToFile(unittest.TestCase):
80
- """Test cases for the _save_html_to_file function."""
81
-
82
- def setUp(self):
83
- """Set up test fixtures with temporary directory."""
84
- self.test_dir = tempfile.mkdtemp()
85
- self.test_html = "<html><body>Test content</body></html>"
86
- self.test_url = "https://www.linkedin.com/in/johndoe"
87
-
88
- def tearDown(self):
89
- """Clean up temporary directory."""
90
- if os.path.exists(self.test_dir):
91
- shutil.rmtree(self.test_dir)
92
-
93
- def test_successful_file_save(self):
94
- """Test successful saving of HTML content to file."""
95
- # Temporarily change the file path calculation
96
- original_dirname = os.path.dirname
97
-
98
- def mock_dirname(path):
99
- if path.endswith('context_acquisition.py'):
100
- return self.test_dir
101
- return original_dirname(path)
102
-
103
- # Replace os.path.dirname temporarily
104
- original_func = functions.context_acquisition.os.path.dirname
105
- functions.context_acquisition.os.path.dirname = mock_dirname
106
-
107
- try:
108
- result = _save_html_to_file(self.test_html, self.test_url)
109
-
110
- # Verify file was created
111
- self.assertTrue(os.path.exists(result))
112
- self.assertTrue(result.endswith('.html'))
113
-
114
- # Verify file content
115
- with open(result, 'r', encoding='utf-8') as f:
116
- content = f.read()
117
- self.assertEqual(content, self.test_html)
118
-
119
- finally:
120
- # Restore original function
121
- functions.context_acquisition.os.path.dirname = original_func
122
-
123
-
124
- class TestSetupChromeDriverOptions(unittest.TestCase):
125
- """Test cases for the setup_chrome_driver_options function."""
126
-
127
- def test_chrome_options_configuration(self):
128
- """Test that Chrome options are properly configured."""
129
- options = setup_chrome_driver_options()
130
-
131
- # Verify that options object is returned
132
- self.assertIsNotNone(options)
133
-
134
- # Verify it's the correct type
135
- self.assertIsInstance(options, Options)
136
-
137
- def test_chrome_options_arguments(self):
138
- """Test that required Chrome arguments are set."""
139
- options = setup_chrome_driver_options()
140
-
141
- # Access the arguments (this is implementation dependent)
142
- # Note: This test verifies the function runs without error
143
- # Specific argument verification would require accessing private attributes
144
- self.assertIsNotNone(options)
145
-
146
-
147
- class TestURLValidation(unittest.TestCase):
148
- """Test cases for URL validation logic (extracted from main function)."""
149
-
150
- def test_valid_linkedin_urls(self):
151
- """Test validation of valid LinkedIn URLs."""
152
- valid_urls = [
153
- "https://www.linkedin.com/in/johndoe",
154
- "https://linkedin.com/in/jane-smith",
155
- "http://www.linkedin.com/in/test123",
156
- "https://www.linkedin.com/in/user-name-with-dashes",
157
- ]
158
-
159
- for url in valid_urls:
160
- # Test the validation logic directly
161
- self.assertTrue(isinstance(url, str))
162
- self.assertTrue(url.strip())
163
- self.assertIn("linkedin.com/in/", url)
164
-
165
- def test_invalid_linkedin_urls(self):
166
- """Test validation of invalid LinkedIn URLs."""
167
- invalid_urls = [
168
- "",
169
- None,
170
- "https://www.example.com/profile",
171
- "https://www.linkedin.com/company/test",
172
- "https://github.com/user",
173
- "not-a-url",
174
- ]
175
-
176
- for url in invalid_urls:
177
- # Test the validation logic directly
178
- if url is None or not isinstance(url, str):
179
- self.assertTrue(url is None or not isinstance(url, str))
180
- elif not url.strip():
181
- self.assertFalse(url.strip())
182
- else:
183
- self.assertNotIn("linkedin.com/in/", url)
184
-
185
-
186
- class TestHTMLContentProcessing(unittest.TestCase):
187
- """Test cases for HTML content processing workflows."""
188
-
189
- def test_html_cleaning_workflow(self):
190
- """Test the complete HTML cleaning workflow."""
191
- raw_html = """<!DOCTYPE html>
192
- <html>
193
-
194
- <head>
195
- <title>LinkedIn Profile</title>
196
-
197
- </head>
198
-
199
- <body>
200
- <div class="profile">
201
- <h1>John Doe</h1>
202
-
203
- <p>Software Engineer</p>
204
- </div>
205
-
206
- </body>
207
-
208
- </html>"""
209
-
210
- cleaned = _clean_html_content(raw_html)
211
-
212
- # Verify no empty lines
213
- lines = cleaned.split('\n')
214
- for line in lines:
215
- self.assertTrue(line.strip(), f"Found empty line: '{line}'")
216
-
217
- # Verify content is preserved
218
- self.assertIn("John Doe", cleaned)
219
- self.assertIn("Software Engineer", cleaned)
220
- self.assertIn("LinkedIn Profile", cleaned)
221
-
222
- def test_minimal_html_cleaning(self):
223
- """Test cleaning of minimal HTML content."""
224
- minimal_html = "<html><body>Content</body></html>"
225
- result = _clean_html_content(minimal_html)
226
- self.assertEqual(result, minimal_html)
227
-
228
- def test_complex_whitespace_patterns(self):
229
- """Test cleaning of complex whitespace patterns."""
230
- complex_html = """<div>
231
- \t\t
232
- <span>Text</span>
233
- \t
234
-
235
- <p>Paragraph</p>
236
- \t
237
- </div>"""
238
-
239
- result = _clean_html_content(complex_html)
240
- lines = result.split('\n')
241
-
242
- # Should have no empty lines
243
- for line in lines:
244
- self.assertTrue(line.strip())
245
-
246
- # Should preserve content
247
- self.assertIn("Text", result)
248
- self.assertIn("Paragraph", result)
249
-
250
-
251
- if __name__ == '__main__':
252
- unittest.main()
 
1
  """
2
  Unit tests for the context_acquisition module.
3
  """