gperdrizet commited on
Commit
f1fa456
·
verified ·
1 Parent(s): 5ba8d84

Added LinkedIn profile scraping functions.

Browse files
functions/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions package for the resumate application.
3
+
4
+ This package contains modules for data acquisition, processing, and analysis
5
+ of LinkedIn profiles, GitHub profiles, and job postings.
6
+ """
7
+
8
+ from .data_acquisition import get_linkedin_profile_html
9
+
10
+ __all__ = ['get_linkedin_profile_html']
functions/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (422 Bytes). View file
 
functions/__pycache__/context_acquisition.cpython-310.pyc ADDED
Binary file (4.01 kB). View file
 
functions/__pycache__/data_acquisition.cpython-310.pyc ADDED
Binary file (4 kB). View file
 
functions/context_acquisition.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ context_acquisition.py
3
+
4
+ Functions for acquiring context from various sources including LinkedIn profiles,
5
+ GitHub profiles, and job postings using browser automation.
6
+ """
7
+
8
+ from selenium import webdriver
9
+ from selenium.webdriver.chrome.options import Options
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.support.ui import WebDriverWait
12
+ from selenium.webdriver.support import expected_conditions as EC
13
+ from selenium.common.exceptions import TimeoutException, WebDriverException
14
+ import time
15
+ import logging
16
+
17
# Set up module-level logging.
# NOTE(review): logging.basicConfig at import time configures the *root*
# logger as a side effect — any application importing this module inherits
# INFO-level root logging. That is fine for a standalone app but unusual
# for a library module; confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
20
+
21
+
22
def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
    """
    Retrieve the HTML content of a LinkedIn profile using browser automation.

    Launches a headless Chrome instance, navigates to the profile URL, waits
    for known LinkedIn page landmarks (or an auth wall) to appear, and returns
    the resulting page source. The browser is always closed, even on failure.

    Args:
        profile_url (str): The URL of the LinkedIn profile to scrape
        wait_time (int): Maximum time to wait for page elements to load (default: 10 seconds)

    Returns:
        str: The HTML content of the LinkedIn profile page

    Raises:
        ValueError: If the URL is not a valid LinkedIn profile URL
        WebDriverException: If there's an issue with the browser automation
            (includes page-load timeouts, which Selenium raises as a
            WebDriverException subclass)
        RuntimeError: If any other unexpected error occurs
    """

    # Validate input before paying the cost of launching a browser.
    if not profile_url or not isinstance(profile_url, str):
        raise ValueError("Profile URL must be a non-empty string")

    if "linkedin.com/in/" not in profile_url:
        raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")

    # Configure Chrome options for headless browsing. The explicit user agent
    # makes the headless browser look like a regular desktop Chrome session.
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None
    try:
        # Initialize the Chrome driver
        logger.info("Initializing browser for URL: %s", profile_url)
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)

        # Navigate to the LinkedIn profile
        logger.info("Navigating to LinkedIn profile...")
        driver.get(profile_url)

        # Wait for any of the common LinkedIn page landmarks. An auth wall
        # still yields usable HTML, so it counts as "loaded" here.
        wait = WebDriverWait(driver, wait_time)

        try:
            wait.until(
                EC.any_of(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".pv-top-card")),      # Profile header
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".profile-section")),  # Profile section
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".authwall")),         # Auth wall
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".public-profile")),   # Public profile
                )
            )
        except TimeoutException:
            # Not fatal: return whatever state the page reached.
            logger.warning("Standard LinkedIn elements not found, proceeding with current page state")

        # Additional wait to ensure dynamic (client-side rendered) content loads
        time.sleep(2)

        # Get the page HTML
        html_content = driver.page_source

        logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
        return html_content

    except WebDriverException as e:
        # Re-raise with context; 'from e' preserves the original traceback.
        logger.error("WebDriver error occurred: %s", e)
        raise WebDriverException(f"Browser automation failed: {str(e)}") from e

    except Exception as e:
        logger.error("Unexpected error occurred: %s", e)
        raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e

    finally:
        # Always clean up the driver, even while an exception is propagating.
        if driver:
            try:
                driver.quit()
                logger.info("Browser session closed")
            except Exception as e:  # pylint: disable=broad-exception-caught
                logger.warning("Error closing browser: %s", e)
108
+
109
+
110
def setup_chrome_driver_options() -> Options:
    """
    Create and configure Chrome driver options for web scraping.

    Runs Chrome headless and suppresses the automation fingerprints
    that sites commonly check for.

    Returns:
        Options: Configured Chrome options object
    """
    opts = Options()

    # Headless, sandbox-free setup suitable for containers and CI hosts.
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
    ):
        opts.add_argument(flag)

    # Hide the "controlled by automated software" markers from the browser.
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)

    return opts
resumate.py CHANGED
@@ -15,14 +15,33 @@ To run:
15
  """
16
 
17
  import gradio as gr
 
18
 
19
 
20
  def process_inputs(linkedin_url, github_url, job_post_url):
21
  """
22
- Placeholder function to process the input URLs.
23
- Replace this docstring and logic with actual implementation as needed.
 
 
 
 
 
 
 
24
  """
25
- return f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}"
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  with gr.Blocks() as demo:
28
  gr.Markdown("# Resumate: Profile & Job Post Input")
 
15
  """
16
 
17
  import gradio as gr
18
+ from functions.context_acquisition import get_linkedin_profile_html
19
 
20
 
21
def process_inputs(linkedin_url, github_url, job_post_url):
    """
    Process the input URLs and retrieve content from LinkedIn profile.

    Args:
        linkedin_url (str): LinkedIn profile URL
        github_url (str): GitHub profile URL
        job_post_url (str): LinkedIn job post URL

    Returns:
        str: Formatted output with URL information and LinkedIn profile status
    """
    # Accumulate output pieces and join once at the end.
    pieces = [f"LinkedIn: {linkedin_url}\nGitHub: {github_url}\nJob Post: {job_post_url}\n\n"]

    # Only attempt retrieval when a non-blank LinkedIn URL was supplied.
    if linkedin_url and linkedin_url.strip():
        pieces.append("Attempting to retrieve LinkedIn profile...\n")
        try:
            html = get_linkedin_profile_html(linkedin_url)
        except Exception as err:  # pylint: disable=broad-exception-caught
            pieces.append(f"❌ Failed to retrieve LinkedIn profile: {str(err)}\n")
        else:
            pieces.append(f"✅ Successfully retrieved LinkedIn profile HTML ({len(html)} characters)\n")

    return "".join(pieces)
45
 
46
  with gr.Blocks() as demo:
47
  gr.Markdown("# Resumate: Profile & Job Post Input")