gperdrizet committed on
Commit
da25614
·
1 Parent(s): f5b66ec

Added function to retrieve user's public GitHub repository list.

Browse files
Files changed (3) hide show
  1. functions/github.py +317 -0
  2. requirements.txt +2 -1
  3. resumate.py +28 -2
functions/github.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ github.py
3
+
4
+ Functions for retrieving information from GitHub profiles and repositories.
5
+ """
6
+
7
+ import re
8
+ import logging
9
+ import requests
10
+ from typing import List, Dict, Optional
11
+ from urllib.parse import urlparse
12
+
13
+ # Set up logging
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def get_github_repositories(github_url: str) -> Dict:
    """
    Retrieve public repositories from a GitHub profile URL.

    Args:
        github_url (str): GitHub profile URL (e.g., https://github.com/username)

    Returns:
        dict: Dictionary containing status, repositories list, and metadata.
            On failure only "status" ("error") and "message" are present.

            In "metadata", "total_repos" is the "public_repos" count reported
            by the user-info lookup, while "public_repos" is the number of
            repositories left after processing, so the two can differ.
            "profile_url" is the "html_url" from the user-info lookup, falling
            back to https://github.com/<username>.

    Example:
        {
            "status": "success",
            "repositories": [
                {
                    "name": "repo-name",
                    "description": "Repository description",
                    "language": "Python",
                    "stars": 10,
                    "forks": 2,
                    "updated_at": "2024-01-01T00:00:00Z",
                    "html_url": "https://github.com/user/repo",
                    "topics": ["python", "api"]
                }
            ],
            "metadata": {
                "username": "username",
                "total_repos": 25,
                "public_repos": 20,
                "profile_url": "https://github.com/username"
            },
            "message": "Successfully retrieved repositories"
        }
    """
    # Reject missing, non-string and whitespace-only input before doing any work.
    if not isinstance(github_url, str) or not github_url.strip():
        return {"status": "error", "message": "No GitHub URL provided"}

    try:
        # Extract username from GitHub URL
        username = _extract_github_username(github_url)
        if not username:
            return {"status": "error", "message": "Invalid GitHub URL format"}

        logger.info("Fetching repositories for GitHub user: %s", username)

        # Get user info first; its error dictionary is passed straight through.
        user_info = _get_github_user_info(username)
        if user_info["status"] != "success":
            return user_info

        # Get repositories; again errors are returned unchanged.
        repositories = _get_user_repositories(username)
        if repositories["status"] != "success":
            return repositories

        # Process and structure repository data
        processed_repos = _process_repository_data(repositories["data"])

        # The raw input may be a bare username or a repository sub-path, so
        # report a canonical profile URL instead of echoing it back.
        profile_url = (
            user_info["data"].get("html_url")
            or f"https://github.com/{username}"
        )

        return {
            "status": "success",
            "repositories": processed_repos,
            "metadata": {
                "username": username,
                "total_repos": user_info["data"].get("public_repos", 0),
                "public_repos": len(processed_repos),
                "profile_url": profile_url
            },
            "message": f"Successfully retrieved {len(processed_repos)} repositories"
        }

    except Exception as e:
        # Top-level boundary: callers expect an error dictionary, never a raise.
        logger.exception("Error retrieving GitHub repositories: %s", e)
        return {
            "status": "error",
            "message": f"Failed to retrieve GitHub repositories: {str(e)}"
        }
93
+
94
+
95
+ def _extract_github_username(github_url: str) -> Optional[str]:
96
+ """
97
+ Extract username from GitHub URL.
98
+
99
+ Args:
100
+ github_url (str): GitHub profile URL
101
+
102
+ Returns:
103
+ Optional[str]: Username if valid URL, None otherwise
104
+ """
105
+ try:
106
+ # Clean up the URL
107
+ url = github_url.strip().rstrip('/')
108
+
109
+ # Handle various GitHub URL formats
110
+ patterns = [
111
+ r'github\.com/([^/]+)/?$', # https://github.com/username
112
+ r'github\.com/([^/]+)/.*', # https://github.com/username/anything
113
+ r'^([a-zA-Z0-9\-_]+)$' # Just username
114
+ ]
115
+
116
+ for pattern in patterns:
117
+ match = re.search(pattern, url)
118
+ if match:
119
+ username = match.group(1)
120
+ # Validate username format
121
+ if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
122
+ return username
123
+
124
+ return None
125
+
126
+ except Exception as e:
127
+ logger.warning(f"Error extracting username from URL {github_url}: {str(e)}")
128
+ return None
129
+
130
+
131
def _get_github_user_info(username: str) -> Dict:
    """
    Get basic user information from GitHub API.

    Args:
        username (str): GitHub username

    Returns:
        dict: {"status": "success", "data": <decoded JSON body>} on HTTP 200,
            otherwise {"status": "error", "message": <reason>}.
    """
    try:
        url = f"https://api.github.com/users/{username}"
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Resumate-App/1.0"
        }

        response = requests.get(url, headers=headers, timeout=10)
        status_code = response.status_code

        if status_code == 404:
            return {"status": "error", "message": f"GitHub user '{username}' not found"}
        # GitHub signals rate limiting with either 403 or 429.
        if status_code in (403, 429):
            return {"status": "error", "message": "GitHub API rate limit exceeded"}
        if status_code != 200:
            return {"status": "error", "message": f"GitHub API error: {status_code}"}

        return {"status": "success", "data": response.json()}

    except requests.RequestException as e:
        logger.error("Network error fetching user info: %s", e)
        return {"status": "error", "message": f"Network error: {str(e)}"}
162
+
163
+
164
def _get_user_repositories(username: str) -> Dict:
    """
    Get user's public repositories from GitHub API.

    Pages through the listing 100 repositories at a time, sorted by most
    recently updated, and stops after 10 pages (at most 1000 repositories).
    A non-200 response on any page discards what was collected and returns
    an error.

    Args:
        username (str): GitHub username

    Returns:
        dict: {"status": "success", "data": <list of raw repository dicts>}
            or {"status": "error", "message": <reason>}.
    """
    per_page = 100  # Maximum allowed by GitHub API
    max_pages = 10  # Safety limit: at most 1000 repos

    # Loop-invariant request settings are built once.
    url = f"https://api.github.com/users/{username}/repos"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "Resumate-App/1.0"
    }

    try:
        all_repos = []

        for page in range(1, max_pages + 1):
            params = {
                # NOTE(review): confirm "public" is an accepted `type` value
                # for the list-user-repositories endpoint.
                "type": "public",
                "sort": "updated",
                "direction": "desc",
                "per_page": per_page,
                "page": page
            }

            response = requests.get(url, headers=headers, params=params, timeout=10)
            status_code = response.status_code

            # Same rate-limit wording as the user-info lookup.
            if status_code in (403, 429):
                return {"status": "error", "message": "GitHub API rate limit exceeded"}
            if status_code != 200:
                return {"status": "error", "message": f"GitHub API error: {status_code}"}

            repos = response.json()
            if not repos:  # No more repositories
                break

            all_repos.extend(repos)

            # If we got less than per_page, we've reached the end
            if len(repos) < per_page:
                break

        return {"status": "success", "data": all_repos}

    except requests.RequestException as e:
        logger.error("Network error fetching repositories: %s", e)
        return {"status": "error", "message": f"Network error: {str(e)}"}
220
+
221
+
222
+ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
223
+ """
224
+ Process and clean repository data for easier consumption.
225
+
226
+ Args:
227
+ repos (List[Dict]): Raw repository data from GitHub API
228
+
229
+ Returns:
230
+ List[Dict]: Processed repository data
231
+ """
232
+ processed = []
233
+
234
+ for repo in repos:
235
+ # Skip forks unless they have significant modifications
236
+ if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
237
+ continue
238
+
239
+ processed_repo = {
240
+ "name": repo.get("name", ""),
241
+ "description": repo.get("description", ""),
242
+ "language": repo.get("language", ""),
243
+ "stars": repo.get("stargazers_count", 0),
244
+ "forks": repo.get("forks_count", 0),
245
+ "updated_at": repo.get("updated_at", ""),
246
+ "created_at": repo.get("created_at", ""),
247
+ "html_url": repo.get("html_url", ""),
248
+ "topics": repo.get("topics", []),
249
+ "size": repo.get("size", 0),
250
+ "is_fork": repo.get("fork", False),
251
+ "default_branch": repo.get("default_branch", "main"),
252
+ "has_issues": repo.get("has_issues", False),
253
+ "has_wiki": repo.get("has_wiki", False),
254
+ "has_pages": repo.get("has_pages", False)
255
+ }
256
+
257
+ processed.append(processed_repo)
258
+
259
+ return processed
260
+
261
+
262
def format_repositories_for_llm(
    github_result: Dict,
    *,
    max_repos: int = 20,
    max_topics: int = 5,
) -> str:
    """
    Format GitHub repositories data for LLM consumption.

    Repository entries with missing or null fields are tolerated: absent
    names and URLs render as "N/A", absent counts as 0, and optional lines
    (description, language, topics) are left out.

    Args:
        github_result (dict): Result from get_github_repositories
        max_repos (int): Maximum number of repositories listed in detail;
            any remainder is summarised in a single trailing line.
        max_topics (int): Maximum number of topics shown per repository.

    Returns:
        str: Formatted text ready for LLM context
    """
    github_result = github_result or {}

    if github_result.get("status") != "success":
        return f"GitHub repositories could not be retrieved: {github_result.get('message', 'Unknown error')}"

    repositories = github_result.get("repositories") or []
    metadata = github_result.get("metadata") or {}

    if not repositories:
        return f"No public repositories found for {metadata.get('username', 'user')}"

    formatted_parts = [
        "=== GITHUB REPOSITORIES ===\n",
        f"Profile: {metadata.get('profile_url', 'N/A')}",
        f"Username: {metadata.get('username', 'N/A')}",
        f"Public Repositories: {len(repositories)}\n"
    ]

    for i, repo in enumerate(repositories[:max_repos], 1):
        repo_info = [
            f"[REPOSITORY {i}]",
            f"Name: {repo.get('name', 'N/A')}",
            f"URL: {repo.get('html_url', 'N/A')}"
        ]

        description = repo.get("description")
        if description:
            repo_info.append(f"Description: {description}")

        language = repo.get("language")
        if language:
            repo_info.append(f"Primary Language: {language}")

        topics = repo.get("topics")
        if topics:
            shown_topics = ", ".join(str(topic) for topic in topics[:max_topics])
            repo_info.append(f"Topics: {shown_topics}")

        # Keep just the date part; a missing timestamp yields an empty value.
        updated_on = str(repo.get("updated_at") or "")[:10]

        repo_info.extend([
            f"Stars: {repo.get('stars') or 0} | Forks: {repo.get('forks') or 0}",
            f"Last Updated: {updated_on}",
            ""  # Empty line between repositories
        ])

        formatted_parts.extend(repo_info)

    remaining = len(repositories) - max_repos
    if remaining > 0:
        formatted_parts.append(f"... and {remaining} more repositories")

    formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")

    return '\n'.join(formatted_parts)
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  gradio==5.35.0
2
- PyPDF2==3.0.1
 
 
1
  gradio==5.35.0
2
+ PyPDF2==3.0.1
3
+ requests==2.31.0
resumate.py CHANGED
@@ -16,6 +16,7 @@ To run:
16
 
17
  import gradio as gr
18
  from functions.linkedin_resume import extract_text_from_linkedin_pdf
 
19
 
20
 
21
  def process_inputs(linkedin_pdf, github_url, job_post_url):
@@ -51,9 +52,34 @@ def process_inputs(linkedin_pdf, github_url, job_post_url):
51
  else:
52
  result += "❌ No LinkedIn resume PDF file uploaded\n\n"
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Process other inputs
55
- result += f"GitHub Profile: {github_url if github_url else 'Not provided'}\n"
56
- result += f"Job Post URL: {job_post_url if job_post_url else 'Not provided'}\n"
57
 
58
  return result
59
 
 
16
 
17
  import gradio as gr
18
  from functions.linkedin_resume import extract_text_from_linkedin_pdf
19
+ from functions.github import get_github_repositories, format_repositories_for_llm
20
 
21
 
22
  def process_inputs(linkedin_pdf, github_url, job_post_url):
 
52
  else:
53
  result += "❌ No LinkedIn resume PDF file uploaded\n\n"
54
 
55
+ # Process GitHub profile
56
+ if github_url and github_url.strip():
57
+ result += f"βœ… GitHub Profile URL provided: {github_url}\n"
58
+
59
+ # Retrieve repositories from GitHub
60
+ github_result = get_github_repositories(github_url)
61
+
62
+ if github_result["status"] == "success":
63
+ metadata = github_result["metadata"]
64
+ repositories = github_result["repositories"]
65
+
66
+ result += f" πŸ“Š GitHub extraction: SUCCESS\n"
67
+ result += f" πŸ‘€ Username: {metadata['username']}\n"
68
+ result += f" πŸ“ Public repositories found: {len(repositories)}\n\n"
69
+
70
+ # Show the formatted repositories for LLM
71
+ result += "πŸ“‚ GITHUB REPOSITORIES (LLM-Ready):\n"
72
+ result += "=" * 60 + "\n"
73
+ result += format_repositories_for_llm(github_result) + "\n"
74
+ result += "=" * 60 + "\n\n"
75
+
76
+ else:
77
+ result += f" ❌ GitHub extraction failed: {github_result['message']}\n\n"
78
+ else:
79
+ result += "❌ No GitHub profile URL provided\n\n"
80
+
81
  # Process other inputs
82
+ result += f"Job Post: {job_post_url if job_post_url else 'Not provided'}\n"
 
83
 
84
  return result
85