gperdrizet commited on
Commit
4f75930
·
verified ·
1 Parent(s): df6f062

Added function to save LinkedIn profile HTML.

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. functions/context_acquisition.py +52 -1
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  __pycache__
2
  .vscode
3
- .venv
 
 
1
  __pycache__
2
  .vscode
3
+ .venv
4
+ html
functions/context_acquisition.py CHANGED
@@ -7,6 +7,9 @@ GitHub profiles, and job postings using browser automation.
7
 
8
  import time
9
  import logging
 
 
 
10
 
11
  from selenium import webdriver
12
  from selenium.webdriver.chrome.options import Options
@@ -23,6 +26,7 @@ logger = logging.getLogger(__name__)
23
  def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
24
  """
25
  Retrieve the HTML content of a LinkedIn profile using browser automation.
 
26
 
27
  Args:
28
  profile_url (str): The URL of the LinkedIn profile to scrape
@@ -35,6 +39,9 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
35
  ValueError: If the URL is not a valid LinkedIn profile URL
36
  WebDriverException: If there's an issue with the browser automation
37
  TimeoutException: If the page takes too long to load
 
 
 
38
  """
39
 
40
  # Validate LinkedIn URL
@@ -91,8 +98,11 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
91
  )),
92
  )
93
  )
 
94
  except TimeoutException:
95
- logger.warning("Standard LinkedIn elements not found, proceeding with current page state")
 
 
96
 
97
  # Additional wait to ensure dynamic content loads
98
  time.sleep(2)
@@ -101,6 +111,10 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
101
  html_content = driver.page_source
102
 
103
  logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
 
 
 
 
104
  return html_content
105
 
106
  except WebDriverException as e:
@@ -121,6 +135,43 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
121
  logger.warning("Error closing browser: %s", str(e))
122
 
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  def setup_chrome_driver_options() -> Options:
125
  """
126
  Create and configure Chrome driver options for web scraping.
 
7
 
8
  import time
9
  import logging
10
+ import os
11
+ from datetime import datetime
12
+ from urllib.parse import urlparse
13
 
14
  from selenium import webdriver
15
  from selenium.webdriver.chrome.options import Options
 
26
  def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
27
  """
28
  Retrieve the HTML content of a LinkedIn profile using browser automation.
29
+ The HTML content is saved to the html directory and also returned.
30
 
31
  Args:
32
  profile_url (str): The URL of the LinkedIn profile to scrape
 
39
  ValueError: If the URL is not a valid LinkedIn profile URL
40
  WebDriverException: If there's an issue with the browser automation
41
  TimeoutException: If the page takes too long to load
42
+
43
+ Note:
44
+ The HTML content is automatically saved to html/linkedin_profile_<name>_<timestamp>.html
45
  """
46
 
47
  # Validate LinkedIn URL
 
98
  )),
99
  )
100
  )
101
+
102
  except TimeoutException:
103
+ logger.warning(
104
+ "Standard LinkedIn elements not found, proceeding with current page state"
105
+ )
106
 
107
  # Additional wait to ensure dynamic content loads
108
  time.sleep(2)
 
111
  html_content = driver.page_source
112
 
113
  logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
114
+
115
+ # Save HTML content to file
116
+ _save_html_to_file(html_content, profile_url)
117
+
118
  return html_content
119
 
120
  except WebDriverException as e:
 
135
  logger.warning("Error closing browser: %s", str(e))
136
 
137
 
138
def _save_html_to_file(html_content: str, profile_url: str) -> str:
    """
    Save HTML content to a file in the html directory.

    The output directory is ``<project root>/html`` (the parent of this
    module's directory), created on demand. The filename is derived from the
    last path segment of the profile URL plus a timestamp, e.g.
    ``linkedin_profile_<name>_<YYYYMMDD_HHMMSS>.html``.

    Args:
        html_content (str): The HTML content to save
        profile_url (str): The original profile URL for filename generation

    Returns:
        str: The path to the saved file, or an empty string if saving failed
    """
    try:
        # Create html directory if it doesn't exist
        html_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'html')
        os.makedirs(html_dir, exist_ok=True)

        # Generate filename from URL and timestamp. Strip any trailing slash
        # first: LinkedIn profile URLs commonly end with '/', which would make
        # the last path segment empty and wrongly fall back to
        # 'unknown_profile'.
        parsed_url = urlparse(profile_url)
        profile_name = parsed_url.path.rstrip('/').split('/')[-1] or 'unknown_profile'
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"linkedin_profile_{profile_name}_{timestamp}.html"

        # Full file path
        file_path = os.path.join(html_dir, filename)

        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        logger.info("HTML content saved to: %s", file_path)
        return file_path

    # Saving is best-effort: a failure here must not abort the scrape that
    # produced the HTML, so log and return a sentinel instead of raising.
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.warning("Failed to save HTML content: %s", str(e))
        return ""
173
+
174
+
175
  def setup_chrome_driver_options() -> Options:
176
  """
177
  Create and configure Chrome driver options for web scraping.