gperdrizet committed
Commit 68bd3e0 · Parent: 0195b9e

Ditched the idea of scraping biographic context from the public LinkedIn profile. Users will export their profile as a PDF and upload it instead.

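Where the upload-based flow goes next: the new handler in resumate.py below only acknowledges the file, but the point of the export is to mine it for biographic context. A minimal sketch of that step, assuming the pypdf package (not in requirements.txt as of this commit) and a hypothetical extract_profile_text helper:

# Sketch only: pypdf is an assumed future dependency and
# extract_profile_text is a hypothetical helper, not part of this commit.
from pypdf import PdfReader


def extract_profile_text(pdf_path: str) -> str:
    """Return the plain text of a LinkedIn profile PDF export."""
    reader = PdfReader(pdf_path)

    # LinkedIn exports are short; concatenating per-page text is enough here
    return "\n".join(page.extract_text() or "" for page in reader.pages)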
.devcontainer/devcontainer.json CHANGED
@@ -3,7 +3,7 @@
 {
     "name": "Python 3.10: resumate",
     "image": "mcr.microsoft.com/devcontainers/python:0-3.11",
-    "onCreateCommand": "sudo apt update && sudo apt upgrade -y && sudo apt install -y chromium && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
+    "onCreateCommand": "sudo apt update && sudo apt upgrade -y && pip3 install --upgrade pip && pip3 install --user -r requirements.txt",
     "customizations": {
         "vscode": {
             "extensions": [
.gitignore CHANGED
@@ -1,4 +1,3 @@
 __pycache__
 .vscode
-.venv
-html
+.venv
functions/__init__.py DELETED
@@ -1,10 +0,0 @@
-"""
-Functions package for the resumate application.
-
-This package contains modules for data acquisition, processing, and analysis
-of LinkedIn profiles, GitHub profiles, and job postings.
-"""
-
-from .context_acquisition import get_linkedin_profile_html
-
-__all__ = ['get_linkedin_profile_html']
functions/context_acquisition.py CHANGED
@@ -1,210 +1,3 @@
 """
-context_acquisition.py
-
-Functions for acquiring context from various sources including LinkedIn profiles,
-GitHub profiles, and job postings using browser automation.
-"""
-
-import time
-import logging
-import os
-from urllib.parse import urlparse
-
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, WebDriverException
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
-    """
-    Retrieve the HTML content of a LinkedIn profile using browser automation.
-    The HTML content is saved to the html directory and also returned.
-
-    Args:
-        profile_url (str): The URL of the LinkedIn profile to scrape
-        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)
-
-    Returns:
-        str: The HTML content of the LinkedIn profile page
-
-    Raises:
-        ValueError: If the URL is not a valid LinkedIn profile URL
-        WebDriverException: If there's an issue with the browser automation
-        TimeoutException: If the page takes too long to load
-
-    Note:
-        The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
-    """
-
-    # Validate LinkedIn URL
-    if not profile_url or not isinstance(profile_url, str):
-        raise ValueError("Profile URL must be a non-empty string")
-
-    if "linkedin.com/in/" not in profile_url:
-        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
-
-    # Configure Chrome options for headless browsing
-    chrome_options = setup_chrome_driver_options()
-
-    driver = None
-    try:
-        # Initialize the Chrome driver
-        logger.info("Initializing browser for URL: %s", profile_url)
-        driver = webdriver.Chrome(options=chrome_options)
-        driver.set_page_load_timeout(30)
-
-        # Navigate to the LinkedIn profile
-        logger.info("Navigating to LinkedIn profile...")
-        driver.get(profile_url)
-
-        # Wait for the page to load
-        # Look for common LinkedIn profile elements
-        wait = WebDriverWait(driver, wait_time)
-
-        try:
-            # Wait for either the main content or login prompt
-            wait.until(
-                EC.any_of(
-                    EC.presence_of_element_located((  # Profile header
-                        By.CSS_SELECTOR,
-                        ".pv-top-card"
-                    )),
-                    EC.presence_of_element_located((  # Profile section
-                        By.CSS_SELECTOR,
-                        ".profile-section"
-                    )),
-                    EC.presence_of_element_located((  # Auth wall
-                        By.CSS_SELECTOR,
-                        ".authwall"
-                    )),
-                    EC.presence_of_element_located((  # Public profile
-                        By.CSS_SELECTOR,
-                        ".public-profile"
-                    )),
-                )
-            )
-
-        except TimeoutException:
-            logger.warning(
-                "Standard LinkedIn elements not found, proceeding with current page state"
-            )
-
-        # Additional wait to ensure dynamic content loads
-        time.sleep(2)
-
-        # Get the page HTML
-        html_content = driver.page_source
-
-        # Clean up HTML by removing blank lines
-        cleaned_html = _clean_html_content(html_content)
-
-        logger.info(
-            "Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
-            len(html_content),
-            len(cleaned_html)
-        )
-
-        # Save HTML content to file
-        _save_html_to_file(cleaned_html, profile_url)
-
-        return cleaned_html
-
-    except WebDriverException as e:
-        logger.error("WebDriver error occurred: %s", str(e))
-        raise WebDriverException(f"Browser automation failed: {str(e)}") from e
-
-    except Exception as e:
-        logger.error("Unexpected error occurred: %s", str(e))
-        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
-
-    finally:
-        # Always clean up the driver
-        if driver:
-            try:
-                driver.quit()
-                logger.info("Browser session closed")
-            except WebDriverException as e:
-                logger.warning("Error closing browser: %s", str(e))
-
-
-def _clean_html_content(html_content: str) -> str:
-    """
-    Clean HTML content by removing blank lines and excessive whitespace.
-
-    Args:
-        html_content (str): The raw HTML content to clean
-
-    Returns:
-        str: Cleaned HTML content with blank lines removed
-    """
-    # Split into lines, strip whitespace, and filter out empty lines
-    lines = html_content.split('\n')
-    cleaned_lines = [line.rstrip() for line in lines if line.strip()]
-
-    # Join back together with single newlines
-    return '\n'.join(cleaned_lines)
-
-
-def _save_html_to_file(html_content: str, profile_url: str) -> str:
-    """
-    Save HTML content to a file in the html directory.
-
-    Args:
-        html_content (str): The HTML content to save
-        profile_url (str): The original profile URL for filename generation
-
-    Returns:
-        str: The path to the saved file
-    """
-    try:
-        # Create html directory if it doesn't exist
-        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
-        os.makedirs(html_dir, exist_ok=True)
-
-        # Generate filename from URL and timestamp
-        parsed_url = urlparse(profile_url)
-        profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
-        filename = f"linkedin_profile_{profile_name}.html"
-
-        # Full file path
-        file_path = os.path.join(html_dir, filename)
-
-        # Save HTML content
-        with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(html_content)
-
-        logger.info("HTML content saved to: %s", file_path)
-        return file_path
-
-    except Exception as e:  # pylint: disable=broad-exception-caught
-        logger.warning("Failed to save HTML content: %s", str(e))
-        return ""
-
-
-def setup_chrome_driver_options() -> Options:
-    """
-    Create and configure Chrome driver options for web scraping.
-
-    Returns:
-        Options: Configured Chrome options object
-    """
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")  # Run in background
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--disable-gpu")
-    chrome_options.add_argument("--window-size=1920,1080")
-    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
-                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
-    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-    chrome_options.add_experimental_option('useAutomationExtension', False)
-
-    return chrome_options
+Functions for acquiring context from various sources.
+"""
packages.txt DELETED
@@ -1 +0,0 @@
-chromium-driver
requirements.txt CHANGED
@@ -1,3 +1 @@
-gradio==5.35.0
-selenium>=4.0.0
-webdriver-manager>=3.8.0
+gradio==5.35.0
resumate.py CHANGED
@@ -1,10 +1,10 @@
 """
 resumate.py
 
-A simple Gradio UI for collecting user profile and job post URLs.
+A simple Gradio UI for collecting user profile and job post information.
 
-This app provides three text input fields for:
-- LinkedIn profile URL
+This app provides inputs for:
+- LinkedIn resume export PDF file upload
 - GitHub profile URL
 - LinkedIn job post URL
 
@@ -15,39 +15,44 @@ To run:
 """
 
 import gradio as gr
-from functions.context_acquisition import get_linkedin_profile_html
 
 
-def process_inputs(linkedin_url, github_url, job_post_url):
+def process_inputs(linkedin_pdf, github_url, job_post_url):
     """
-    Process the input URLs and retrieve content from LinkedIn profile.
+    Process the input files and URLs.
 
     Args:
-        linkedin_url (str): LinkedIn profile URL
+        linkedin_pdf: Uploaded LinkedIn resume export PDF file
        github_url (str): GitHub profile URL
        job_post_url (str): LinkedIn job post URL
 
     Returns:
-        str: Formatted output with URL information and LinkedIn profile status
+        str: Formatted output with file and URL information
     """
-    result = f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}\n\n"
-
-    # Try to retrieve LinkedIn profile HTML if URL is provided
-    if linkedin_url and linkedin_url.strip():
-        try:
-            result += "Attempting to retrieve LinkedIn profile...\n"
-            html_content = get_linkedin_profile_html(linkedin_url)
-            result += f"LinkedIn profile HTML ({len(html_content)} characters)\n"
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            result += f"❌ Failed to retrieve LinkedIn profile: {str(e)}\n"
+    result = ""
+
+    # Process LinkedIn PDF file
+    if linkedin_pdf is not None:
+        result += f"✅ LinkedIn Resume PDF uploaded: {linkedin_pdf.name}\n"
+        result += f"   File size: {len(linkedin_pdf.read())} bytes\n\n"
+        # Reset file pointer for potential future use
+        linkedin_pdf.seek(0)
+    else:
+        result += "❌ No LinkedIn resume PDF file uploaded\n\n"
+
+    # Process other inputs
+    result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
+    result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"
 
     return result
 
 with gr.Blocks() as demo:
     gr.Markdown("# Resumate: Profile & Job Post Input")
-    linkedin_profile = gr.Textbox(
-        label="LinkedIn Profile URL",
-        placeholder="Enter your LinkedIn profile URL"
+
+    linkedin_pdf = gr.File(
+        label="LinkedIn Resume Export PDF",
+        file_types=[".pdf"],
+        file_count="single"
     )
 
     github_profile = gr.Textbox(
@@ -61,11 +66,11 @@ with gr.Blocks() as demo:
     )
 
     submit_btn = gr.Button("Submit")
-    output = gr.Textbox(label="Output", lines=3)
+    output = gr.Textbox(label="Output", lines=5)
 
     submit_btn.click(  # pylint: disable=no-member
         process_inputs,
-        inputs=[linkedin_profile, github_profile, job_post],
+        inputs=[linkedin_pdf, github_profile, job_post],
        outputs=output
     )
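One caveat on the new handler: recent Gradio releases default gr.File to type="filepath", in which case process_inputs receives a plain path string and the .name/.read()/.seek() calls above would fail. A defensive sketch (an assumption about the component's return type, not part of this commit) that handles both shapes:

import os

def describe_upload(linkedin_pdf) -> str:
    """Summarize a gr.File value whether it arrives as a path or a file object."""
    if linkedin_pdf is None:
        return "❌ No LinkedIn resume PDF file uploaded\n\n"

    if isinstance(linkedin_pdf, str):
        # type="filepath": Gradio hands over a path to the uploaded temp file
        name, size = linkedin_pdf, os.path.getsize(linkedin_pdf)
    else:
        # File-object style value exposing .name / .read() / .seek()
        name = linkedin_pdf.name
        size = len(linkedin_pdf.read())
        linkedin_pdf.seek(0)  # Reset pointer for any later reads

    return f"✅ LinkedIn Resume PDF uploaded: {name}\n   File size: {size} bytes\n\n"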
tests/test_context_acquisition.py CHANGED
@@ -1,252 +1,3 @@
 """
 Unit tests for the context_acquisition module.
 """
-
-import unittest
-import os
-import tempfile
-import shutil
-from selenium.webdriver.chrome.options import Options
-
-import functions.context_acquisition
-
-# Import the functions to test
-from functions.context_acquisition import (
-    _clean_html_content,
-    _save_html_to_file,
-    setup_chrome_driver_options
-)
-
-
-class TestCleanHTMLContent(unittest.TestCase):
-    """Test cases for the _clean_html_content function."""
-
-    def test_remove_blank_lines(self):
-        """Test removal of blank lines from HTML content."""
-        html_with_blanks = """<html>
-
-<head>
-<title>Test</title>
-
-</head>
-
-<body>
-<div>Content</div>
-
-</body>
-</html>"""
-
-        expected = """<html>
-<head>
-<title>Test</title>
-</head>
-<body>
-<div>Content</div>
-</body>
-</html>"""
-
-        result = _clean_html_content(html_with_blanks)
-        self.assertEqual(result, expected)
-
-    def test_strip_trailing_whitespace(self):
-        """Test removal of trailing whitespace from lines."""
-        html_with_trailing = "<div>Content</div> \n<p>Text</p>\t\n"
-        expected = "<div>Content</div>\n<p>Text</p>"
-
-        result = _clean_html_content(html_with_trailing)
-        self.assertEqual(result, expected)
-
-    def test_empty_content(self):
-        """Test handling of empty or whitespace-only content."""
-        self.assertEqual(_clean_html_content(""), "")
-        self.assertEqual(_clean_html_content(" \n\n\t "), "")
-        self.assertEqual(_clean_html_content("\n"), "")
-
-    def test_single_line_content(self):
-        """Test cleaning of single line content."""
-        single_line = "<html><body>Content</body></html>"
-        result = _clean_html_content(single_line)
-        self.assertEqual(result, single_line)
-
-    def test_mixed_whitespace(self):
-        """Test handling of mixed whitespace characters."""
-        mixed = "<div>\t\n \n\r\n<p>Text</p>\n \n</div>"
-        expected = "<div>\n<p>Text</p>\n</div>"
-        result = _clean_html_content(mixed)
-        self.assertEqual(result, expected)
-
-
-class TestSaveHTMLToFile(unittest.TestCase):
-    """Test cases for the _save_html_to_file function."""
-
-    def setUp(self):
-        """Set up test fixtures with temporary directory."""
-        self.test_dir = tempfile.mkdtemp()
-        self.test_html = "<html><body>Test content</body></html>"
-        self.test_url = "https://www.linkedin.com/in/johndoe"
-
-    def tearDown(self):
-        """Clean up temporary directory."""
-        if os.path.exists(self.test_dir):
-            shutil.rmtree(self.test_dir)
-
-    def test_successful_file_save(self):
-        """Test successful saving of HTML content to file."""
-        # Temporarily change the file path calculation
-        original_dirname = os.path.dirname
-
-        def mock_dirname(path):
-            if path.endswith('context_acquisition.py'):
-                return self.test_dir
-            return original_dirname(path)
-
-        # Replace os.path.dirname temporarily
-        original_func = functions.context_acquisition.os.path.dirname
-        functions.context_acquisition.os.path.dirname = mock_dirname
-
-        try:
-            result = _save_html_to_file(self.test_html, self.test_url)
-
-            # Verify file was created
-            self.assertTrue(os.path.exists(result))
-            self.assertTrue(result.endswith('.html'))
-
-            # Verify file content
-            with open(result, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self.assertEqual(content, self.test_html)
-
-        finally:
-            # Restore original function
-            functions.context_acquisition.os.path.dirname = original_func
-
-
-class TestSetupChromeDriverOptions(unittest.TestCase):
-    """Test cases for the setup_chrome_driver_options function."""
-
-    def test_chrome_options_configuration(self):
-        """Test that Chrome options are properly configured."""
-        options = setup_chrome_driver_options()
-
-        # Verify that options object is returned
-        self.assertIsNotNone(options)
-
-        # Verify it's the correct type
-        self.assertIsInstance(options, Options)
-
-    def test_chrome_options_arguments(self):
-        """Test that required Chrome arguments are set."""
-        options = setup_chrome_driver_options()
-
-        # Access the arguments (this is implementation dependent)
-        # Note: This test verifies the function runs without error
-        # Specific argument verification would require accessing private attributes
-        self.assertIsNotNone(options)
-
-
-class TestURLValidation(unittest.TestCase):
-    """Test cases for URL validation logic (extracted from main function)."""
-
-    def test_valid_linkedin_urls(self):
-        """Test validation of valid LinkedIn URLs."""
-        valid_urls = [
-            "https://www.linkedin.com/in/johndoe",
-            "https://linkedin.com/in/jane-smith",
-            "http://www.linkedin.com/in/test123",
-            "https://www.linkedin.com/in/user-name-with-dashes",
-        ]
-
-        for url in valid_urls:
-            # Test the validation logic directly
-            self.assertTrue(isinstance(url, str))
-            self.assertTrue(url.strip())
-            self.assertIn("linkedin.com/in/", url)
-
-    def test_invalid_linkedin_urls(self):
-        """Test validation of invalid LinkedIn URLs."""
-        invalid_urls = [
-            "",
-            None,
-            "https://www.example.com/profile",
-            "https://www.linkedin.com/company/test",
-            "https://github.com/user",
-            "not-a-url",
-        ]
-
-        for url in invalid_urls:
-            # Test the validation logic directly
-            if url is None or not isinstance(url, str):
-                self.assertTrue(url is None or not isinstance(url, str))
-            elif not url.strip():
-                self.assertFalse(url.strip())
-            else:
-                self.assertNotIn("linkedin.com/in/", url)
-
-
-class TestHTMLContentProcessing(unittest.TestCase):
-    """Test cases for HTML content processing workflows."""
-
-    def test_html_cleaning_workflow(self):
-        """Test the complete HTML cleaning workflow."""
-        raw_html = """<!DOCTYPE html>
-<html>
-
-<head>
-<title>LinkedIn Profile</title>
-
-</head>
-
-<body>
-<div class="profile">
-<h1>John Doe</h1>
-
-<p>Software Engineer</p>
-</div>
-
-</body>
-
-</html>"""
-
-        cleaned = _clean_html_content(raw_html)
-
-        # Verify no empty lines
-        lines = cleaned.split('\n')
-        for line in lines:
-            self.assertTrue(line.strip(), f"Found empty line: '{line}'")
-
-        # Verify content is preserved
-        self.assertIn("John Doe", cleaned)
-        self.assertIn("Software Engineer", cleaned)
-        self.assertIn("LinkedIn Profile", cleaned)
-
-    def test_minimal_html_cleaning(self):
-        """Test cleaning of minimal HTML content."""
-        minimal_html = "<html><body>Content</body></html>"
-        result = _clean_html_content(minimal_html)
-        self.assertEqual(result, minimal_html)
-
-    def test_complex_whitespace_patterns(self):
-        """Test cleaning of complex whitespace patterns."""
-        complex_html = """<div>
-\t\t
-<span>Text</span>
-\t
-
-<p>Paragraph</p>
-\t
-</div>"""
-
-        result = _clean_html_content(complex_html)
-        lines = result.split('\n')
-
-        # Should have no empty lines
-        for line in lines:
-            self.assertTrue(line.strip())
-
-        # Should preserve content
-        self.assertIn("Text", result)
-        self.assertIn("Paragraph", result)
-
-
-if __name__ == '__main__':
-    unittest.main()