General clean up; added saving of results for LinkedIn resume parsing and GitHub repo list retrieval.
Files changed:
- functions/github.py +85 -51
- functions/gradio.py +46 -31
- functions/job_call.py +7 -1
- functions/linkedin_resume.py +87 -48
- functions/writer_agent.py +14 -7
functions/github.py
CHANGED
@@ -5,9 +5,12 @@ Functions for retrieving information from GitHub profiles and repositories.
 """
 
 import re
+import json
 import logging
-import requests
 from typing import List, Dict, Optional
+from pathlib import Path
+
+import requests
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -49,29 +52,32 @@ def get_github_repositories(github_url: str) -> Dict:
     """
     if not github_url or not github_url.strip():
         return {"status": "error", "message": "No GitHub URL provided"}
-
+
     try:
         # Extract username from GitHub URL
         username = _extract_github_username(github_url)
+
         if not username:
             return {"status": "error", "message": "Invalid GitHub URL format"}
-
-        logger.info(
-
+
+        logger.info("Fetching repositories for GitHub user: %s", username)
+
         # Get user info first
        user_info = _get_github_user_info(username)
+
         if user_info["status"] != "success":
             return user_info
-
+
         # Get repositories
         repositories = _get_user_repositories(username)
+
         if repositories["status"] != "success":
             return repositories
-
+
         # Process and structure repository data
         processed_repos = _process_repository_data(repositories["data"])
-
-
+
+        result = {
             "status": "success",
             "repositories": processed_repos,
             "metadata": {
@@ -82,9 +88,25 @@ def get_github_repositories(github_url: str) -> Dict:
             },
             "message": f"Successfully retrieved {len(processed_repos)} repositories"
         }
-
-
-
+
+        # Save results to JSON file
+        try:
+            data_dir = Path(__file__).parent.parent / "data"
+            data_dir.mkdir(exist_ok=True)
+
+            output_file = data_dir / "github_repos.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            logger.info("GitHub repositories saved to %s", output_file)
+        except Exception as save_error: # pylint: disable=broad-exception-caught
+            logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))
+
+        return result
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error("Error retrieving GitHub repositories: %s", str(e))
+
         return {
             "status": "error",
             "message": f"Failed to retrieve GitHub repositories: {str(e)}"
@@ -104,26 +126,29 @@ def _extract_github_username(github_url: str) -> Optional[str]:
     try:
         # Clean up the URL
         url = github_url.strip().rstrip('/')
-
+
         # Handle various GitHub URL formats
         patterns = [
            r'github\.com/([^/]+)/?$', # https://github.com/username
            r'github\.com/([^/]+)/.*', # https://github.com/username/anything
            r'^([a-zA-Z0-9\-_]+)$' # Just username
        ]
-
+
         for pattern in patterns:
             match = re.search(pattern, url)
+
             if match:
                 username = match.group(1)
+
                 # Validate username format
                 if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
                     return username
-
+
         return None
-
-    except Exception as e:
-        logger.warning(
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
+
         return None
 
 
@@ -143,20 +168,23 @@ def _get_github_user_info(username: str) -> Dict:
             "Accept": "application/vnd.github.v3+json",
             "User-Agent": "Resumate-App/1.0"
         }
-
+
         response = requests.get(url, headers=headers, timeout=10)
-
+
         if response.status_code == 404:
             return {"status": "error", "message": f"GitHub user '{username}' not found"}
+
         elif response.status_code == 403:
             return {"status": "error", "message": "GitHub API rate limit exceeded"}
+
         elif response.status_code != 200:
             return {"status": "error", "message": f"GitHub API error: {response.status_code}"}
-
+
         return {"status": "success", "data": response.json()}
-
+
     except requests.RequestException as e:
-        logger.error(
+        logger.error("Network error fetching user info: %s", str(e))
+
         return {"status": "error", "message": f"Network error: {str(e)}"}
 
 
@@ -175,8 +203,9 @@ def _get_user_repositories(username: str) -> Dict:
         all_repos = []
         page = 1
         per_page = 100 # Maximum allowed by GitHub API
-
+
         while True:
+
             url = f"https://api.github.com/users/{username}/repos"
             params = {
                 "type": "public",
@@ -189,32 +218,34 @@ def _get_user_repositories(username: str) -> Dict:
                 "Accept": "application/vnd.github.v3+json",
                 "User-Agent": "Resumate-App/1.0"
             }
-
+
             response = requests.get(url, headers=headers, params=params, timeout=10)
-
+
             if response.status_code != 200:
                 return {"status": "error", "message": f"GitHub API error: {response.status_code}"}
-
+
             repos = response.json()
+
             if not repos: # No more repositories
                 break
-
+
             all_repos.extend(repos)
-
+
             # If we got less than per_page, we've reached the end
             if len(repos) < per_page:
                 break
-
+
             page += 1
-
+
            # Safety limit to prevent infinite loops
            if page > 10: # Max 1000 repos
                break
-
+
         return {"status": "success", "data": all_repos}
-
+
     except requests.RequestException as e:
-        logger.error(
+        logger.error("Network error fetching repositories: %s", str(e))
+
         return {"status": "error", "message": f"Network error: {str(e)}"}
 
 
@@ -229,12 +260,13 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
         List[Dict]: Processed repository data
     """
     processed = []
-
+
     for repo in repos:
+
         # Skip forks unless they have significant modifications
         if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
             continue
-
+
         processed_repo = {
             "name": repo.get("name", ""),
             "description": repo.get("description", ""),
@@ -252,9 +284,9 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
             "has_wiki": repo.get("has_wiki", False),
             "has_pages": repo.get("has_pages", False)
         }
-
+
         processed.append(processed_repo)
-
+
     return processed
 
 
@@ -268,49 +300,51 @@ def format_repositories_for_llm(github_result: Dict) -> str:
     Returns:
         str: Formatted text ready for LLM context
     """
+
     if github_result.get("status") != "success":
-        return
-
+        return "GitHub repositories could not be retrieved: " + \
+            f"{github_result.get('message', 'Unknown error')}"
+
     repositories = github_result.get("repositories", [])
     metadata = github_result.get("metadata", {})
-
+
     if not repositories:
         return f"No public repositories found for {metadata.get('username', 'user')}"
-
+
     formatted_parts = [
         "=== GITHUB REPOSITORIES ===\n",
         f"Profile: {metadata.get('profile_url', 'N/A')}",
         f"Username: {metadata.get('username', 'N/A')}",
         f"Public Repositories: {len(repositories)}\n"
     ]
-
+
     for i, repo in enumerate(repositories[:20], 1): # Limit to top 20 repos
         repo_info = [
             f"[REPOSITORY {i}]",
             f"Name: {repo['name']}",
             f"URL: {repo['html_url']}"
         ]
-
+
         if repo['description']:
             repo_info.append(f"Description: {repo['description']}")
-
+
         if repo['language']:
             repo_info.append(f"Primary Language: {repo['language']}")
-
+
         if repo['topics']:
             repo_info.append(f"Topics: {', '.join(repo['topics'][:5])}") # Limit topics
-
+
         repo_info.extend([
             f"Stars: {repo['stars']} | Forks: {repo['forks']}",
             f"Last Updated: {repo['updated_at'][:10]}", # Just the date
             "" # Empty line between repositories
         ])
-
+
         formatted_parts.extend(repo_info)
-
+
     if len(repositories) > 20:
         formatted_parts.append(f"... and {len(repositories) - 20} more repositories")
-
+
     formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")
-
+
     return '\n'.join(formatted_parts)
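A minimal usage sketch for the updated module (assuming the functions package is on the import path; the profile URL below is only an example). On success, the structured result is also written to data/github_repos.json next to the package:

from functions.github import get_github_repositories, format_repositories_for_llm

# Fetch public repositories for a profile; a copy of the structured result
# is also saved to data/github_repos.json by the function itself.
result = get_github_repositories("https://github.com/octocat")  # example profile URL

if result["status"] == "success":
    # Flatten the structured result into plain text suitable for an LLM prompt.
    print(format_repositories_for_llm(result))
else:
    print(result["message"])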
functions/gradio.py
CHANGED
@@ -31,55 +31,61 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
     result = ""
     extraction_result = None
     logger.info("Processing user inputs from Gradio interface")
-
+
     # Process LinkedIn PDF file
     if linkedin_pdf is not None:
         result += "✅ LinkedIn Resume PDF uploaded\n"
-        logger.info(
-
+        logger.info("Processing LinkedIn PDF: %s", linkedin_pdf.name)
+
         # Extract and structure text from the PDF
         extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
-
+
         if extraction_result["status"] == "success":
             result += " ✅ Text extraction successful\n\n"
             logger.info("LinkedIn PDF text extraction successful")
 
         elif extraction_result["status"] == "warning":
             result += f" ⚠️ Text extraction: {extraction_result['message']}\n\n"
-            logger.warning(
+            logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
         else:
             result += f" ❌ Text extraction failed: {extraction_result['message']}\n\n"
-            logger.error(
+            logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
     else:
         result += "❌ No LinkedIn resume PDF file uploaded\n\n"
         logger.info("No LinkedIn PDF file provided")
-
+
     # Process GitHub profile
     if github_url and github_url.strip():
         result += "✅ GitHub Profile URL provided\n"
-        logger.info(
-
+        logger.info("Processing GitHub URL: %s", github_url)
+
         # Retrieve repositories from GitHub
         github_result = get_github_repositories(github_url)
-
+
         if github_result["status"] == "success":
             result += " ✅ GitHub list download successful\n\n"
-            logger.info(
-
+            logger.info(
+                "GitHub repositories retrieved successfully for %s",
+                github_result['metadata']['username']
+            )
+
         else:
             result += f" ❌ GitHub extraction failed: {github_result['message']}\n\n"
-            logger.error(
+            logger.error("GitHub extraction failed: %s", github_result['message'])
     else:
         result += "❌ No GitHub profile URL provided\n\n"
         logger.info("No GitHub URL provided")
-
+
     # Process job post text
     if job_post_text and job_post_text.strip():
         result += "✅ Job post text provided\n"
-        logger.info(
+        logger.info("Job post text provided (%d characters)", len(job_post_text))
+
         summary = summarize_job_call(job_post_text)
-        result += " ✅ Job post summary generated\n"
         result += summary
+        result += " ✅ Job post summary generated\n"
+        logger.info("Job post summary generated (%d characters)", len(summary))
+
     else:
         result += "❌ Job post not provided\n"
         logger.info("No job post text provided")
@@ -87,7 +93,7 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
     # Process user instructions
     if user_instructions and user_instructions.strip():
         result += "✅ Additional instructions provided\n"
-        logger.info(
+        logger.info("User instructions provided (%d characters)", len(user_instructions))
     else:
         result += "ℹ️ No additional instructions provided\n"
         logger.info("No additional instructions provided")
@@ -97,21 +103,22 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
     # Generate resume only if we have valid extraction result
     if extraction_result and extraction_result.get("status") == "success":
         try:
-
-            result +=
+            _ = write_resume(extraction_result, user_instructions)
+            result += "\n✅ Resume generated successfully\n"
             logger.info("Resume generation completed successfully")
-
+
+        except Exception as e: # pylint: disable=broad-exception-caught
             result += f"\n❌ Resume generation failed: {str(e)}\n"
-            logger.error(
+            logger.error("Resume generation failed: %s", str(e))
     else:
-        result +=
+        result += "\n❌ Cannot generate resume: No valid LinkedIn data extracted\n"
         result += "Please ensure you upload a valid LinkedIn PDF export file.\n"
         logger.warning("Resume generation skipped - no valid LinkedIn data available")
-
+
     return result
 
 
-def get_processed_data(linkedin_pdf, github_url, job_post_text,
+def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
     """
     Get structured data from all inputs for further processing.
 
@@ -119,33 +126,41 @@ def get_processed_data(linkedin_pdf, github_url, job_post_text, user_instruction
         linkedin_pdf: Uploaded LinkedIn resume export PDF file
         github_url (str): GitHub profile URL
         job_post_text (str): Job post text content
-
-
+        instructions (str): Additional instructions from the user
+
     Returns:
         dict: Structured data containing all processed information
     """
+
+    job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
+    instructions = instructions.strip() if instructions and instructions.strip() else None
+
     processed_data = {
         "linkedin": None,
         "github": None,
-        "job_post": job_post_text
-        "user_instructions":
+        "job_post": job_post_text,
+        "user_instructions": instructions,
        "errors": []
    }
-
+
     # Process LinkedIn PDF
     if linkedin_pdf is not None:
         extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
+
         if extraction_result["status"] == "success":
             processed_data["linkedin"] = extraction_result
+
         else:
             processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
-
+
     # Process GitHub profile
     if github_url and github_url.strip():
         github_result = get_github_repositories(github_url)
+
         if github_result["status"] == "success":
             processed_data["github"] = github_result
+
         else:
             processed_data["errors"].append(f"GitHub: {github_result['message']}")
-
+
     return processed_data
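A rough sketch of calling the revised get_processed_data signature from other code. The SimpleNamespace stand-in for the Gradio file object and all argument values below are assumptions for illustration only; in the app, Gradio supplies the upload object itself:

from types import SimpleNamespace

from functions.gradio import get_processed_data

# Gradio passes an object whose .name attribute holds the uploaded file's path;
# SimpleNamespace mimics that shape here for the sake of the example.
pdf_stub = SimpleNamespace(name="data/linkedin_export.pdf")  # placeholder path

data = get_processed_data(
    pdf_stub,
    "https://github.com/octocat",            # example profile URL
    "Senior Python developer, remote role",  # illustrative job post text
    "Keep the resume to one page",           # extra user instructions
)
print(data["errors"])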
functions/job_call.py
CHANGED
@@ -1,9 +1,15 @@
 '''Functions for summarizing and formatting job calls.'''
 
 import os
+import logging
 from openai import OpenAI
 from configuration import JOB_CALL_EXTRACTION_PROMPT
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
 def summarize_job_call(job_call: str) -> str:
     '''Extracts and summarizes key information from job call.'''
 
@@ -51,4 +57,4 @@ def summarize_job_call(job_call: str) -> str:
     else:
         summary = None
 
-    return summary
+    return summary
functions/linkedin_resume.py
CHANGED
@@ -9,6 +9,8 @@ import re
 import logging
 import io
 import os
+import json
+from pathlib import Path
 import PyPDF2
 
 # Set up logging
@@ -41,35 +43,37 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
     """
     if pdf_file is None:
         return {"status": "error", "message": "No PDF file provided"}
-
+
     try:
         # Get filename from path
         filename = os.path.basename(pdf_file)
-
+
         # Read the PDF file from the file path
         with open(pdf_file, 'rb') as file:
             file_content = file.read()
             file_size = len(file_content)
-
+
         # Create PDF reader from the file content
         pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
-
+
         # Extract text from all pages
         extracted_text = ""
         num_pages = len(pdf_reader.pages)
-
+
         for page_num in range(num_pages):
             try:
                 page = pdf_reader.pages[page_num]
                 page_text = page.extract_text()
                 extracted_text += page_text + "\n\n"
-
-
+
+            except Exception as e: # pylint: disable=broad-exception-caught
+                logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e))
+
                 continue
-
+
         # Clean and structure the extracted text for LLM consumption
         structured_content = _structure_resume_text(extracted_text)
-
+
         if not structured_content["full_text"].strip():
             return {
                 "status": "warning",
@@ -81,10 +85,14 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
                 },
                 "message": "PDF processed but no text content was extracted"
             }
-
-        logger.info(
-
-
+
+        logger.info(
+            "Successfully extracted and structured %d characters from %s",
+            len(structured_content['full_text']),
+            filename
+        )
+
+        result = {
             "status": "success",
             "structured_text": structured_content,
             "metadata": {
@@ -95,9 +103,26 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
             },
             "message": f"Text extracted and structured successfully from {num_pages} pages"
         }
-
-
-
+
+        # Save results to JSON file
+        try:
+            data_dir = Path(__file__).parent.parent / "data"
+            data_dir.mkdir(exist_ok=True)
+
+            output_file = data_dir / "linkedin_resume.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            logger.info("LinkedIn resume extraction saved to %s", output_file)
+
+        except Exception as save_error: # pylint: disable=broad-exception-caught
+            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
+
+        return result
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error("Error processing PDF file: %s", str(e))
+
         return {
             "status": "error",
             "message": f"Failed to extract text from PDF: {str(e)}"
@@ -124,10 +149,10 @@ def _structure_resume_text(text: str) -> dict:
         "word_count": 0,
         "section_count": 0
     }
-
+
     # Clean the text first
     cleaned_text = _clean_extracted_text(text)
-
+
     # Define section patterns (common LinkedIn export sections)
     section_patterns = {
         "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
@@ -137,56 +162,66 @@ def _structure_resume_text(text: str) -> dict:
         "skills": r"(?i)(skills|competencies|technologies|technical)",
         "certifications": r"(?i)(certification|certificate|license)",
     }
-
+
     # Split text into lines for processing
     lines = cleaned_text.split('\n')
     sections = {}
     current_section = "general"
     current_content = []
-
+
     for line in lines:
         line = line.strip()
+
         if not line:
             continue
-
+
         # Check if line is a section header
         section_found = None
+
         for section_name, pattern in section_patterns.items():
             if re.match(pattern, line):
+
                 section_found = section_name
                 break
-
+
         if section_found:
+
             # Save previous section content
             if current_content:
                 sections[current_section] = '\n'.join(current_content)
-
+
             # Start new section
             current_section = section_found
             current_content = [line]
+
         else:
             current_content.append(line)
-
+
     # Save the last section
     if current_content:
         sections[current_section] = '\n'.join(current_content)
-
+
     # Create a structured summary for LLM context
     summary_parts = []
+
     if "contact_info" in sections:
         summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
+
     if "summary" in sections:
         summary_parts.append(f"SUMMARY: {sections['summary']}")
+
     if "experience" in sections:
         summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
+
     if "education" in sections:
         summary_parts.append(f"EDUCATION: {sections['education']}")
+
     if "skills" in sections:
         summary_parts.append(f"SKILLS: {sections['skills']}")
-
+
     # Create LLM-optimized format
-    llm_formatted_text = _format_for_llm(sections
-
+    llm_formatted_text = _format_for_llm(sections)
+
     return {
         "sections": sections,
         "full_text": cleaned_text,
@@ -198,7 +233,7 @@ def _structure_resume_text(text: str) -> dict:
     }
 
 
-def _format_for_llm(sections: dict
+def _format_for_llm(sections: dict) -> str:
     """
     Format the resume sections in an optimal way for LLM processing.
 
@@ -210,32 +245,35 @@ def _format_for_llm(sections: dict, full_text: str) -> str:
         str: LLM-optimized formatted text
     """
     formatted_parts = ["=== RESUME CONTENT ===\n"]
-
+
     # Prioritize sections in logical order for LLM
-    priority_order = ["summary", "contact_info", "experience", "education", "skills",
+    priority_order = ["summary", "contact_info", "experience", "education", "skills",
                       "certifications", "projects", "achievements", "languages", "volunteer"]
-
+
     # Add prioritized sections
     for section_name in priority_order:
         if section_name in sections:
+
             formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
             formatted_parts.append(sections[section_name])
             formatted_parts.append("") # Empty line between sections
-
+
     # Add any remaining sections
     for section_name, content in sections.items():
         if section_name not in priority_order and section_name != "general":
+
             formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
             formatted_parts.append(content)
             formatted_parts.append("")
-
+
     # Add general content if exists
     if "general" in sections:
+
         formatted_parts.append("[ADDITIONAL INFORMATION]")
         formatted_parts.append(sections["general"])
-
+
     formatted_parts.append("\n=== END RESUME ===")
-
+
     return '\n'.join(formatted_parts)
 
 
@@ -251,39 +289,40 @@ def _clean_extracted_text(text: str) -> str:
     """
     if not text:
         return ""
-
+
     # Remove excessive whitespace and normalize line endings
     text = re.sub(r'\r\n', '\n', text)
     text = re.sub(r'\r', '\n', text)
-
+
     # Split into lines and clean each line
     lines = text.split('\n')
     cleaned_lines = []
-
+
     for line in lines:
+
         # Strip whitespace
         cleaned_line = line.strip()
-
+
         # Skip empty lines and very short lines (likely artifacts)
         if len(cleaned_line) < 2:
             continue
-
+
         # Remove common PDF artifacts
         cleaned_line = re.sub(r'^\d+$', '', cleaned_line) # Page numbers
         cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line) # Separator lines
-
+
         if cleaned_line:
             cleaned_lines.append(cleaned_line)
-
+
     # Join lines and normalize spacing
     cleaned_text = '\n'.join(cleaned_lines)
-
+
     # Normalize multiple spaces to single spaces
     cleaned_text = re.sub(r' +', ' ', cleaned_text)
-
+
     # Normalize multiple newlines to maximum of 2
     cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-
+
     return cleaned_text.strip()
 
 
@@ -299,8 +338,8 @@ def get_llm_context_from_resume(extraction_result: dict) -> str:
     """
     if extraction_result.get("status") != "success":
         return ""
-
+
     structured_text = extraction_result.get("structured_text", {})
-
+
     # Return the LLM-formatted version if available, otherwise fall back to full text
     return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
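As with the GitHub helper, the extraction result is now mirrored to data/linkedin_resume.json. A minimal sketch of the round trip (the PDF path is a placeholder and the functions package is assumed importable):

from functions.linkedin_resume import (
    extract_text_from_linkedin_pdf,
    get_llm_context_from_resume,
)

# Parse a LinkedIn profile export; on success the structured result is also
# saved to data/linkedin_resume.json by the function itself.
extraction = extract_text_from_linkedin_pdf("data/Profile.pdf")  # placeholder path

if extraction["status"] == "success":
    # Section-ordered plain text ready to drop into an LLM prompt.
    print(get_llm_context_from_resume(extraction)[:500])
else:
    print(extraction["message"])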
functions/writer_agent.py
CHANGED
@@ -23,7 +23,7 @@ def write_resume(content: str, user_instructions: str = None) -> str:
     """
 
     if content['status'] == 'success':
-
+
         agent = CodeAgent(
             model=AGENT_MODEL,
             tools=[],
@@ -36,7 +36,9 @@ def write_resume(content: str, user_instructions: str = None) -> str:
 
         # Prepare instructions - combine default with user instructions
         instructions = INSTRUCTIONS
+
         if user_instructions and user_instructions.strip():
+
             instructions += f"\n\nAdditional user instructions:\n{user_instructions.strip()}"
             logger.info("Added user instructions to agent prompt")
 
@@ -44,21 +46,26 @@ def write_resume(content: str, user_instructions: str = None) -> str:
             instructions + '\n' + json.dumps(content['structured_text']),
         )
 
-        logger.info("submitted_answer: %s", submitted_answer)
-
+        logger.info("submitted_answer: %s", submitted_answer)
+
         # Create data directory if it doesn't exist
         data_dir = 'data'
+
         if not os.path.exists(data_dir):
+
             os.makedirs(data_dir)
             logger.info("Created data directory: %s", data_dir)
-
+
         # Save the resume to resume.md in the data directory
         resume_file_path = os.path.join(data_dir, 'resume.md')
+
         try:
             with open(resume_file_path, 'w', encoding='utf-8') as f:
                 f.write(submitted_answer)
+
             logger.info("Resume saved to: %s", resume_file_path)
-
+
+        except Exception as e: # pylint: disable=broad-exception-caught
             logger.error("Failed to save resume to file: %s", e)
-
-        return submitted_answer
+
+        return submitted_answer
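write_resume still returns the generated text and now also leaves a copy in data/resume.md. A hedged end-to-end sketch (paths and the instruction string are illustrative, and the functions package is assumed importable):

from functions.linkedin_resume import extract_text_from_linkedin_pdf
from functions.writer_agent import write_resume

extraction = extract_text_from_linkedin_pdf("data/Profile.pdf")  # placeholder path

if extraction["status"] == "success":
    # The agent's markdown output is returned and also written to data/resume.md.
    resume_md = write_resume(extraction, "Emphasize machine learning projects")
    print(resume_md[:300])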