gperdrizet committed on
Commit
7dcc57a
·
verified ·
1 Parent(s): 5826d5a

Fixed LinkedIn HTML file name.

Browse files
Files changed (1) hide show
  1. functions/context_acquisition.py +29 -7
functions/context_acquisition.py CHANGED
@@ -8,7 +8,6 @@ GitHub profiles, and job postings using browser automation.
8
  import time
9
  import logging
10
  import os
11
- from datetime import datetime
12
  from urllib.parse import urlparse
13
 
14
  from selenium import webdriver
@@ -110,12 +109,16 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
110
  # Get the page HTML
111
  html_content = driver.page_source
112
 
113
- logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
 
 
 
 
114
 
115
  # Save HTML content to file
116
- _save_html_to_file(html_content, profile_url)
117
 
118
- return html_content
119
 
120
  except WebDriverException as e:
121
  logger.error("WebDriver error occurred: %s", str(e))
@@ -135,6 +138,24 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
135
  logger.warning("Error closing browser: %s", str(e))
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def _save_html_to_file(html_content: str, profile_url: str) -> str:
139
  """
140
  Save HTML content to a file in the html directory.
@@ -153,9 +174,10 @@ def _save_html_to_file(html_content: str, profile_url: str) -> str:
153
 
154
  # Generate filename from URL and timestamp
155
  parsed_url = urlparse(profile_url)
156
- profile_name = parsed_url.path.split('/')[-1] or 'unknown_profile'
157
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
158
- filename = f"linkedin_profile_{profile_name}_{timestamp}.html"
 
159
 
160
  # Full file path
161
  file_path = os.path.join(html_dir, filename)
 
8
  import time
9
  import logging
10
  import os
 
11
  from urllib.parse import urlparse
12
 
13
  from selenium import webdriver
 
109
  # Get the page HTML
110
  html_content = driver.page_source
111
 
112
+ # Clean up HTML by removing blank lines
113
+ cleaned_html = _clean_html_content(html_content)
114
+
115
+ logger.info("Successfully retrieved HTML content (%d characters, cleaned to %d characters)",
116
+ len(html_content), len(cleaned_html))
117
 
118
  # Save HTML content to file
119
+ _save_html_to_file(cleaned_html, profile_url)
120
 
121
+ return cleaned_html
122
 
123
  except WebDriverException as e:
124
  logger.error("WebDriver error occurred: %s", str(e))
 
138
  logger.warning("Error closing browser: %s", str(e))
139
 
140
 
141
+ def _clean_html_content(html_content: str) -> str:
142
+ """
143
+ Clean HTML content by removing blank lines and trailing whitespace.
144
+
145
+ Args:
146
+ html_content (str): The raw HTML content to clean
147
+
148
+ Returns:
149
+ str: Cleaned HTML content with blank lines removed
150
+ """
151
+ # Split into lines, drop whitespace-only lines, and strip trailing whitespace (leading indentation is kept)
152
+ lines = html_content.split('\n')
153
+ cleaned_lines = [line.rstrip() for line in lines if line.strip()]
154
+
155
+ # Join back together with single newlines
156
+ return '\n'.join(cleaned_lines)
157
+
158
+
159
  def _save_html_to_file(html_content: str, profile_url: str) -> str:
160
  """
161
  Save HTML content to a file in the html directory.
 
174
 
175
  # Generate filename from the profile name in the URL path
176
  parsed_url = urlparse(profile_url)
177
+ print(parsed_url)
178
+ profile_name = parsed_url.path.split('/')[2] or 'unknown_profile'
179
+ print(profile_name)
180
+ filename = f"linkedin_profile_{profile_name}.html"
181
 
182
  # Full file path
183
  file_path = os.path.join(html_dir, filename)