gperdrizet committed on
Commit b9464fb · verified · 1 Parent(s): bcdc087

General clean up; added saving of results for LinkedIn resume parsing and GitHub repo list retrieval.

functions/github.py CHANGED
@@ -5,9 +5,12 @@ Functions for retrieving information from GitHub profiles and repositories.
 """
 
 import re
+import json
 import logging
-import requests
 from typing import List, Dict, Optional
+from pathlib import Path
+
+import requests
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -49,29 +52,32 @@ def get_github_repositories(github_url: str) -> Dict:
     """
     if not github_url or not github_url.strip():
         return {"status": "error", "message": "No GitHub URL provided"}
-
+
     try:
         # Extract username from GitHub URL
        username = _extract_github_username(github_url)
+
        if not username:
            return {"status": "error", "message": "Invalid GitHub URL format"}
-
-        logger.info(f"Fetching repositories for GitHub user: {username}")
-
+
+        logger.info("Fetching repositories for GitHub user: %s", username)
+
        # Get user info first
        user_info = _get_github_user_info(username)
+
        if user_info["status"] != "success":
            return user_info
-
+
        # Get repositories
        repositories = _get_user_repositories(username)
+
        if repositories["status"] != "success":
            return repositories
-
+
        # Process and structure repository data
        processed_repos = _process_repository_data(repositories["data"])
-
-        return {
+
+        result = {
            "status": "success",
            "repositories": processed_repos,
            "metadata": {
@@ -82,9 +88,25 @@ def get_github_repositories(github_url: str) -> Dict:
            },
            "message": f"Successfully retrieved {len(processed_repos)} repositories"
        }
-
-    except Exception as e:
-        logger.error(f"Error retrieving GitHub repositories: {str(e)}")
+
+        # Save results to JSON file
+        try:
+            data_dir = Path(__file__).parent.parent / "data"
+            data_dir.mkdir(exist_ok=True)
+
+            output_file = data_dir / "github_repos.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            logger.info("GitHub repositories saved to %s", output_file)
+        except Exception as save_error: # pylint: disable=broad-exception-caught
+            logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))
+
+        return result
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error("Error retrieving GitHub repositories: %s", str(e))
+
        return {
            "status": "error",
            "message": f"Failed to retrieve GitHub repositories: {str(e)}"
@@ -104,26 +126,29 @@ def _extract_github_username(github_url: str) -> Optional[str]:
    try:
        # Clean up the URL
        url = github_url.strip().rstrip('/')
-
+
        # Handle various GitHub URL formats
        patterns = [
            r'github\.com/([^/]+)/?$',   # https://github.com/username
            r'github\.com/([^/]+)/.*',   # https://github.com/username/anything
            r'^([a-zA-Z0-9\-_]+)$'       # Just username
        ]
-
+
        for pattern in patterns:
            match = re.search(pattern, url)
+
            if match:
                username = match.group(1)
+
                # Validate username format
                if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
                    return username
-
+
        return None
-
-    except Exception as e:
-        logger.warning(f"Error extracting username from URL {github_url}: {str(e)}")
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
+
        return None
 
 
@@ -143,20 +168,23 @@ def _get_github_user_info(username: str) -> Dict:
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Resumate-App/1.0"
        }
-
+
        response = requests.get(url, headers=headers, timeout=10)
-
+
        if response.status_code == 404:
            return {"status": "error", "message": f"GitHub user '{username}' not found"}
+
        elif response.status_code == 403:
            return {"status": "error", "message": "GitHub API rate limit exceeded"}
+
        elif response.status_code != 200:
            return {"status": "error", "message": f"GitHub API error: {response.status_code}"}
-
+
        return {"status": "success", "data": response.json()}
-
+
    except requests.RequestException as e:
-        logger.error(f"Network error fetching user info: {str(e)}")
+        logger.error("Network error fetching user info: %s", str(e))
+
        return {"status": "error", "message": f"Network error: {str(e)}"}
 
 
@@ -175,8 +203,9 @@ def _get_user_repositories(username: str) -> Dict:
        all_repos = []
        page = 1
        per_page = 100  # Maximum allowed by GitHub API
-
+
        while True:
+
            url = f"https://api.github.com/users/{username}/repos"
            params = {
                "type": "public",
@@ -189,32 +218,34 @@ def _get_user_repositories(username: str) -> Dict:
                "Accept": "application/vnd.github.v3+json",
                "User-Agent": "Resumate-App/1.0"
            }
-
+
            response = requests.get(url, headers=headers, params=params, timeout=10)
-
+
            if response.status_code != 200:
                return {"status": "error", "message": f"GitHub API error: {response.status_code}"}
-
+
            repos = response.json()
+
            if not repos:  # No more repositories
                break
-
+
            all_repos.extend(repos)
-
+
            # If we got less than per_page, we've reached the end
            if len(repos) < per_page:
                break
-
+
            page += 1
-
+
            # Safety limit to prevent infinite loops
            if page > 10:  # Max 1000 repos
                break
-
+
        return {"status": "success", "data": all_repos}
-
+
    except requests.RequestException as e:
-        logger.error(f"Network error fetching repositories: {str(e)}")
+        logger.error("Network error fetching repositories: %s", str(e))
+
        return {"status": "error", "message": f"Network error: {str(e)}"}
 
 
@@ -229,12 +260,13 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
        List[Dict]: Processed repository data
    """
    processed = []
-
+
    for repo in repos:
+
        # Skip forks unless they have significant modifications
        if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
            continue
-
+
        processed_repo = {
            "name": repo.get("name", ""),
            "description": repo.get("description", ""),
@@ -252,9 +284,9 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
            "has_wiki": repo.get("has_wiki", False),
            "has_pages": repo.get("has_pages", False)
        }
-
+
        processed.append(processed_repo)
-
+
    return processed
 
 
@@ -268,49 +300,51 @@ def format_repositories_for_llm(github_result: Dict) -> str:
    Returns:
        str: Formatted text ready for LLM context
    """
+
    if github_result.get("status") != "success":
-        return f"GitHub repositories could not be retrieved: {github_result.get('message', 'Unknown error')}"
-
+        return "GitHub repositories could not be retrieved: " + \
+            f"{github_result.get('message', 'Unknown error')}"
+
    repositories = github_result.get("repositories", [])
    metadata = github_result.get("metadata", {})
-
+
    if not repositories:
        return f"No public repositories found for {metadata.get('username', 'user')}"
-
+
    formatted_parts = [
        "=== GITHUB REPOSITORIES ===\n",
        f"Profile: {metadata.get('profile_url', 'N/A')}",
        f"Username: {metadata.get('username', 'N/A')}",
        f"Public Repositories: {len(repositories)}\n"
    ]
-
+
    for i, repo in enumerate(repositories[:20], 1):  # Limit to top 20 repos
        repo_info = [
            f"[REPOSITORY {i}]",
            f"Name: {repo['name']}",
            f"URL: {repo['html_url']}"
        ]
-
+
        if repo['description']:
            repo_info.append(f"Description: {repo['description']}")
-
+
        if repo['language']:
            repo_info.append(f"Primary Language: {repo['language']}")
-
+
        if repo['topics']:
            repo_info.append(f"Topics: {', '.join(repo['topics'][:5])}")  # Limit topics
-
+
        repo_info.extend([
            f"Stars: {repo['stars']} | Forks: {repo['forks']}",
            f"Last Updated: {repo['updated_at'][:10]}",  # Just the date
            ""  # Empty line between repositories
        ])
-
+
        formatted_parts.extend(repo_info)
-
+
    if len(repositories) > 20:
        formatted_parts.append(f"... and {len(repositories) - 20} more repositories")
-
+
    formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")
-
+
    return '\n'.join(formatted_parts)
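
Usage sketch (not part of the commit): a minimal example of how the updated get_github_repositories() and format_repositories_for_llm() could be called. The username and output preview length are illustrative; the only behavioral change here is that the call now also writes data/github_repos.json relative to the repository root.

from functions.github import get_github_repositories, format_repositories_for_llm

# "octocat" is a placeholder username, not one used by the project
result = get_github_repositories("https://github.com/octocat")

if result["status"] == "success":
    # Side effect added in this commit: the result dict is also saved to data/github_repos.json
    llm_context = format_repositories_for_llm(result)
    print(llm_context[:500])
else:
    print(result["message"])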
functions/gradio.py CHANGED
@@ -31,55 +31,61 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
    result = ""
    extraction_result = None
    logger.info("Processing user inputs from Gradio interface")
-
+
    # Process LinkedIn PDF file
    if linkedin_pdf is not None:
        result += "✅ LinkedIn Resume PDF uploaded\n"
-        logger.info(f"Processing LinkedIn PDF: {linkedin_pdf.name}")
-
+        logger.info("Processing LinkedIn PDF: %s", linkedin_pdf.name)
+
        # Extract and structure text from the PDF
        extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
-
+
        if extraction_result["status"] == "success":
            result += "  ✅ Text extraction successful\n\n"
            logger.info("LinkedIn PDF text extraction successful")
 
        elif extraction_result["status"] == "warning":
            result += f"  ⚠️ Text extraction: {extraction_result['message']}\n\n"
-            logger.warning(f"LinkedIn PDF extraction warning: {extraction_result['message']}")
+            logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
        else:
            result += f"  ❌ Text extraction failed: {extraction_result['message']}\n\n"
-            logger.error(f"LinkedIn PDF extraction failed: {extraction_result['message']}")
+            logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
    else:
        result += "❌ No LinkedIn resume PDF file uploaded\n\n"
        logger.info("No LinkedIn PDF file provided")
-
+
    # Process GitHub profile
    if github_url and github_url.strip():
        result += "✅ GitHub Profile URL provided\n"
-        logger.info(f"Processing GitHub URL: {github_url}")
-
+        logger.info("Processing GitHub URL: %s", github_url)
+
        # Retrieve repositories from GitHub
        github_result = get_github_repositories(github_url)
-
+
        if github_result["status"] == "success":
            result += "  ✅ GitHub list download successful\n\n"
-            logger.info(f"GitHub repositories retrieved successfully for {github_result['metadata']['username']}")
-
+            logger.info(
+                "GitHub repositories retrieved successfully for %s",
+                github_result['metadata']['username']
+            )
+
        else:
            result += f"  ❌ GitHub extraction failed: {github_result['message']}\n\n"
-            logger.error(f"GitHub extraction failed: {github_result['message']}")
+            logger.error("GitHub extraction failed: %s", github_result['message'])
    else:
        result += "❌ No GitHub profile URL provided\n\n"
        logger.info("No GitHub URL provided")
-
+
    # Process job post text
    if job_post_text and job_post_text.strip():
        result += "✅ Job post text provided\n"
-        logger.info(f"Job post text provided ({len(job_post_text)} characters)")
+        logger.info("Job post text provided (%d characters)", len(job_post_text))
+
        summary = summarize_job_call(job_post_text)
-        result += "  ✅ Job post summary generated\n"
        result += summary
+        result += "  ✅ Job post summary generated\n"
+        logger.info("Job post summary generated (%d characters)", len(summary))
+
    else:
        result += "❌ Job post not provided\n"
        logger.info("No job post text provided")
@@ -87,7 +93,7 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
    # Process user instructions
    if user_instructions and user_instructions.strip():
        result += "✅ Additional instructions provided\n"
-        logger.info(f"User instructions provided ({len(user_instructions)} characters)")
+        logger.info("User instructions provided (%d characters)", len(user_instructions))
    else:
        result += "ℹ️ No additional instructions provided\n"
        logger.info("No additional instructions provided")
@@ -97,21 +103,22 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
    # Generate resume only if we have valid extraction result
    if extraction_result and extraction_result.get("status") == "success":
        try:
-            resume = write_resume(extraction_result, user_instructions)
-            result += f"\n✅ Resume generated successfully\n"
+            _ = write_resume(extraction_result, user_instructions)
+            result += "\n✅ Resume generated successfully\n"
            logger.info("Resume generation completed successfully")
-        except Exception as e:
+
+        except Exception as e: # pylint: disable=broad-exception-caught
            result += f"\n❌ Resume generation failed: {str(e)}\n"
-            logger.error(f"Resume generation failed: {str(e)}")
+            logger.error("Resume generation failed: %s", str(e))
    else:
-        result += f"\n❌ Cannot generate resume: No valid LinkedIn data extracted\n"
+        result += "\n❌ Cannot generate resume: No valid LinkedIn data extracted\n"
        result += "Please ensure you upload a valid LinkedIn PDF export file.\n"
        logger.warning("Resume generation skipped - no valid LinkedIn data available")
-
+
    return result
 
 
-def get_processed_data(linkedin_pdf, github_url, job_post_text, user_instructions):
+def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
    """
    Get structured data from all inputs for further processing.
 
@@ -119,33 +126,41 @@ def get_processed_data(linkedin_pdf, github_url, job_post_text, user_instruction
        linkedin_pdf: Uploaded LinkedIn resume export PDF file
        github_url (str): GitHub profile URL
        job_post_text (str): Job post text content
-        user_instructions (str): Additional instructions from the user
-
+        instructions (str): Additional instructions from the user
+
    Returns:
        dict: Structured data containing all processed information
    """
+
+    job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
+    instructions = instructions.strip() if instructions and instructions.strip() else None
+
    processed_data = {
        "linkedin": None,
        "github": None,
-        "job_post": job_post_text.strip() if job_post_text and job_post_text.strip() else None,
-        "user_instructions": user_instructions.strip() if user_instructions and user_instructions.strip() else None,
+        "job_post": job_post_text,
+        "user_instructions": instructions,
        "errors": []
    }
-
+
    # Process LinkedIn PDF
    if linkedin_pdf is not None:
        extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
+
        if extraction_result["status"] == "success":
            processed_data["linkedin"] = extraction_result
+
        else:
            processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
-
+
    # Process GitHub profile
    if github_url and github_url.strip():
        github_result = get_github_repositories(github_url)
+
        if github_result["status"] == "success":
            processed_data["github"] = github_result
+
        else:
            processed_data["errors"].append(f"GitHub: {github_result['message']}")
-
+
    return processed_data
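
Usage sketch (not part of the commit): one plausible way to wire process_inputs() into a Gradio interface. The component choices and labels below are assumptions, since the app's actual entry point is not shown in this diff.

import gradio as gr

from functions.gradio import process_inputs

# Hypothetical layout; the real app may use different components or a Blocks UI
demo = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.File(label="LinkedIn resume export (PDF)"),
        gr.Textbox(label="GitHub profile URL"),
        gr.Textbox(label="Job post text", lines=10),
        gr.Textbox(label="Additional instructions", lines=3),
    ],
    outputs=gr.Textbox(label="Status"),
)

if __name__ == "__main__":
    demo.launch()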
functions/job_call.py CHANGED
@@ -1,9 +1,15 @@
 '''Functions for summarizing and formatting job calls.'''
 
 import os
+import logging
 from openai import OpenAI
 from configuration import JOB_CALL_EXTRACTION_PROMPT
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
 def summarize_job_call(job_call: str) -> str:
     '''Extracts and summarizes key information from job call.'''
 
@@ -51,4 +57,4 @@ def summarize_job_call(job_call: str) -> str:
    else:
        summary = None
 
-    return summary
+    return summary
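
Usage sketch (not part of the commit): a minimal call to summarize_job_call(). The job text is illustrative, and it assumes whatever OpenAI-compatible credentials the module reads from the environment are already configured.

from functions.job_call import summarize_job_call

# Illustrative job post text
summary = summarize_job_call(
    "Acme Corp is hiring a data scientist: Python, SQL, ML pipelines, remote friendly."
)
print(summary or "No summary produced")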
functions/linkedin_resume.py CHANGED
@@ -9,6 +9,8 @@ import re
 import logging
 import io
 import os
+import json
+from pathlib import Path
 import PyPDF2
 
 # Set up logging
@@ -41,35 +43,37 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
    """
    if pdf_file is None:
        return {"status": "error", "message": "No PDF file provided"}
-
+
    try:
        # Get filename from path
        filename = os.path.basename(pdf_file)
-
+
        # Read the PDF file from the file path
        with open(pdf_file, 'rb') as file:
            file_content = file.read()
            file_size = len(file_content)
-
+
        # Create PDF reader from the file content
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
-
+
        # Extract text from all pages
        extracted_text = ""
        num_pages = len(pdf_reader.pages)
-
+
        for page_num in range(num_pages):
            try:
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                extracted_text += page_text + "\n\n"
-            except Exception as e:
-                logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
+
+            except Exception as e: # pylint: disable=broad-exception-caught
+                logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e))
+
                continue
-
+
        # Clean and structure the extracted text for LLM consumption
        structured_content = _structure_resume_text(extracted_text)
-
+
        if not structured_content["full_text"].strip():
            return {
                "status": "warning",
@@ -81,10 +85,14 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
                },
                "message": "PDF processed but no text content was extracted"
            }
-
-        logger.info(f"Successfully extracted and structured {len(structured_content['full_text'])} characters from {filename}")
-
-        return {
+
+        logger.info(
+            "Successfully extracted and structured %d characters from %s",
+            len(structured_content['full_text']),
+            filename
+        )
+
+        result = {
            "status": "success",
            "structured_text": structured_content,
            "metadata": {
@@ -95,9 +103,26 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
            },
            "message": f"Text extracted and structured successfully from {num_pages} pages"
        }
-
-    except Exception as e:
-        logger.error(f"Error processing PDF file: {str(e)}")
+
+        # Save results to JSON file
+        try:
+            data_dir = Path(__file__).parent.parent / "data"
+            data_dir.mkdir(exist_ok=True)
+
+            output_file = data_dir / "linkedin_resume.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            logger.info("LinkedIn resume extraction saved to %s", output_file)
+
+        except Exception as save_error: # pylint: disable=broad-exception-caught
+            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
+
+        return result
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error("Error processing PDF file: %s", str(e))
+
        return {
            "status": "error",
            "message": f"Failed to extract text from PDF: {str(e)}"
@@ -124,10 +149,10 @@ def _structure_resume_text(text: str) -> dict:
            "word_count": 0,
            "section_count": 0
        }
-
+
    # Clean the text first
    cleaned_text = _clean_extracted_text(text)
-
+
    # Define section patterns (common LinkedIn export sections)
    section_patterns = {
        "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
@@ -137,56 +162,66 @@ def _structure_resume_text(text: str) -> dict:
        "skills": r"(?i)(skills|competencies|technologies|technical)",
        "certifications": r"(?i)(certification|certificate|license)",
    }
-
+
    # Split text into lines for processing
    lines = cleaned_text.split('\n')
    sections = {}
    current_section = "general"
    current_content = []
-
+
    for line in lines:
        line = line.strip()
+
        if not line:
            continue
-
+
        # Check if line is a section header
        section_found = None
+
        for section_name, pattern in section_patterns.items():
            if re.match(pattern, line):
+
                section_found = section_name
                break
-
+
        if section_found:
+
            # Save previous section content
            if current_content:
                sections[current_section] = '\n'.join(current_content)
-
+
            # Start new section
            current_section = section_found
            current_content = [line]
+
        else:
            current_content.append(line)
-
+
    # Save the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)
-
+
    # Create a structured summary for LLM context
    summary_parts = []
+
    if "contact_info" in sections:
        summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
+
    if "summary" in sections:
        summary_parts.append(f"SUMMARY: {sections['summary']}")
+
    if "experience" in sections:
        summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
+
    if "education" in sections:
        summary_parts.append(f"EDUCATION: {sections['education']}")
+
    if "skills" in sections:
        summary_parts.append(f"SKILLS: {sections['skills']}")
-
+
    # Create LLM-optimized format
-    llm_formatted_text = _format_for_llm(sections, cleaned_text)
-
+    llm_formatted_text = _format_for_llm(sections)
+
    return {
        "sections": sections,
        "full_text": cleaned_text,
@@ -198,7 +233,7 @@ def _structure_resume_text(text: str) -> dict:
    }
 
 
-def _format_for_llm(sections: dict, full_text: str) -> str:
+def _format_for_llm(sections: dict) -> str:
    """
    Format the resume sections in an optimal way for LLM processing.
 
@@ -210,32 +245,35 @@ def _format_for_llm(sections: dict) -> str:
        str: LLM-optimized formatted text
    """
    formatted_parts = ["=== RESUME CONTENT ===\n"]
-
+
    # Prioritize sections in logical order for LLM
-    priority_order = ["summary", "contact_info", "experience", "education", "skills",
+    priority_order = ["summary", "contact_info", "experience", "education", "skills",
                      "certifications", "projects", "achievements", "languages", "volunteer"]
-
+
    # Add prioritized sections
    for section_name in priority_order:
        if section_name in sections:
+
            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
            formatted_parts.append(sections[section_name])
            formatted_parts.append("")  # Empty line between sections
-
+
    # Add any remaining sections
    for section_name, content in sections.items():
        if section_name not in priority_order and section_name != "general":
+
            formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
            formatted_parts.append(content)
            formatted_parts.append("")
-
+
    # Add general content if exists
    if "general" in sections:
+
        formatted_parts.append("[ADDITIONAL INFORMATION]")
        formatted_parts.append(sections["general"])
-
+
    formatted_parts.append("\n=== END RESUME ===")
-
+
    return '\n'.join(formatted_parts)
 
 
@@ -251,39 +289,40 @@ def _clean_extracted_text(text: str) -> str:
    """
    if not text:
        return ""
-
+
    # Remove excessive whitespace and normalize line endings
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'\r', '\n', text)
-
+
    # Split into lines and clean each line
    lines = text.split('\n')
    cleaned_lines = []
-
+
    for line in lines:
+
        # Strip whitespace
        cleaned_line = line.strip()
-
+
        # Skip empty lines and very short lines (likely artifacts)
        if len(cleaned_line) < 2:
            continue
-
+
        # Remove common PDF artifacts
        cleaned_line = re.sub(r'^\d+$', '', cleaned_line)  # Page numbers
        cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line)  # Separator lines
-
+
        if cleaned_line:
            cleaned_lines.append(cleaned_line)
-
+
    # Join lines and normalize spacing
    cleaned_text = '\n'.join(cleaned_lines)
-
+
    # Normalize multiple spaces to single spaces
    cleaned_text = re.sub(r' +', ' ', cleaned_text)
-
+
    # Normalize multiple newlines to maximum of 2
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-
+
    return cleaned_text.strip()
 
 
@@ -299,8 +338,8 @@ def get_llm_context_from_resume(extraction_result: dict) -> str:
    """
    if extraction_result.get("status") != "success":
        return ""
-
+
    structured_text = extraction_result.get("structured_text", {})
-
+
    # Return the LLM-formatted version if available, otherwise fall back to full text
    return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
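
Usage sketch (not part of the commit): calling extract_text_from_linkedin_pdf() on a LinkedIn profile export and then reading back the JSON file this commit starts writing. The PDF path is hypothetical, and the data/ path assumes the current working directory is the repository root.

import json
from pathlib import Path

from functions.linkedin_resume import extract_text_from_linkedin_pdf, get_llm_context_from_resume

extraction = extract_text_from_linkedin_pdf("exports/Profile.pdf")  # hypothetical path

if extraction["status"] == "success":
    print(get_llm_context_from_resume(extraction)[:500])

    # New in this commit: the same result dict is persisted for later pipeline steps
    saved = json.loads(Path("data/linkedin_resume.json").read_text(encoding="utf-8"))
    print(saved["metadata"])
else:
    print(extraction["message"])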
functions/writer_agent.py CHANGED
@@ -23,7 +23,7 @@ def write_resume(content: str, user_instructions: str = None) -> str:
    """
 
    if content['status'] == 'success':
-
+
        agent = CodeAgent(
            model=AGENT_MODEL,
            tools=[],
@@ -36,7 +36,9 @@ def write_resume(content: str, user_instructions: str = None) -> str:
 
        # Prepare instructions - combine default with user instructions
        instructions = INSTRUCTIONS
+
        if user_instructions and user_instructions.strip():
+
            instructions += f"\n\nAdditional user instructions:\n{user_instructions.strip()}"
            logger.info("Added user instructions to agent prompt")
 
@@ -44,21 +46,26 @@ def write_resume(content: str, user_instructions: str = None) -> str:
            instructions + '\n' + json.dumps(content['structured_text']),
        )
 
-        logger.info("submitted_answer: %s", submitted_answer)
-
+        logger.info("submitted_answer: %s", submitted_answer)
+
        # Create data directory if it doesn't exist
        data_dir = 'data'
+
        if not os.path.exists(data_dir):
+
            os.makedirs(data_dir)
            logger.info("Created data directory: %s", data_dir)
-
+
        # Save the resume to resume.md in the data directory
        resume_file_path = os.path.join(data_dir, 'resume.md')
+
        try:
            with open(resume_file_path, 'w', encoding='utf-8') as f:
                f.write(submitted_answer)
+
            logger.info("Resume saved to: %s", resume_file_path)
-        except Exception as e:
+
+        except Exception as e: # pylint: disable=broad-exception-caught
            logger.error("Failed to save resume to file: %s", e)
-
-        return submitted_answer
+
+        return submitted_answer
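
Usage sketch (not part of the commit): write_resume() consumes the dict returned by extract_text_from_linkedin_pdf() and returns the generated Markdown, which it also writes to data/resume.md. The PDF path and instruction text below are illustrative.

from functions.linkedin_resume import extract_text_from_linkedin_pdf
from functions.writer_agent import write_resume

extraction = extract_text_from_linkedin_pdf("exports/Profile.pdf")  # hypothetical path

if extraction["status"] == "success":
    resume_md = write_resume(extraction, "Emphasize open-source and MLOps experience.")
    print(resume_md[:500])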