""" | |
github.py | |
Functions for retrieving information from GitHub profiles and repositories. | |
""" | |
import re | |
import json | |
import logging | |
from typing import List, Dict, Optional | |
from pathlib import Path | |
import requests | |
# Set up logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |


def get_github_repositories(github_url: str) -> Dict:
    """
    Retrieve public repositories from a GitHub profile URL.

    Args:
        github_url (str): GitHub profile URL (e.g., https://github.com/username)

    Returns:
        dict: Dictionary containing status, repositories list, and metadata

    Example:
        {
            "status": "success",
            "repositories": [
                {
                    "name": "repo-name",
                    "description": "Repository description",
                    "language": "Python",
                    "stars": 10,
                    "forks": 2,
                    "updated_at": "2024-01-01T00:00:00Z",
                    "html_url": "https://github.com/user/repo",
                    "topics": ["python", "api"]
                }
            ],
            "metadata": {
                "username": "username",
                "total_repos": 25,
                "public_repos": 20
            },
            "message": "Successfully retrieved repositories"
        }
    """
    if not github_url or not github_url.strip():
        return {"status": "error", "message": "No GitHub URL provided"}

    try:
        # Extract username from GitHub URL
        username = _extract_github_username(github_url)
        if not username:
            return {"status": "error", "message": "Invalid GitHub URL format"}

        logger.info("Fetching repositories for GitHub user: %s", username)

        # Get user info first
        user_info = _get_github_user_info(username)
        if user_info["status"] != "success":
            return user_info

        # Get repositories
        repositories = _get_user_repositories(username)
        if repositories["status"] != "success":
            return repositories

        # Process and structure repository data
        processed_repos = _process_repository_data(repositories["data"])

        result = {
            "status": "success",
            "repositories": processed_repos,
            "metadata": {
                "username": username,
                "total_repos": user_info["data"].get("public_repos", 0),
                "public_repos": len(processed_repos),
                "profile_url": github_url
            },
            "message": f"Successfully retrieved {len(processed_repos)} repositories"
        }

        # Save results to JSON file
        try:
            data_dir = Path(__file__).parent.parent / "data"
            data_dir.mkdir(exist_ok=True)
            output_file = data_dir / "github_repos.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            logger.info("GitHub repositories saved to %s", output_file)
        except Exception as save_error:  # pylint: disable=broad-exception-caught
            logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))

        return result

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error("Error retrieving GitHub repositories: %s", str(e))
        return {
            "status": "error",
            "message": f"Failed to retrieve GitHub repositories: {str(e)}"
        }


def _extract_github_username(github_url: str) -> Optional[str]:
    """
    Extract username from GitHub URL.

    Args:
        github_url (str): GitHub profile URL

    Returns:
        Optional[str]: Username if valid URL, None otherwise
    """
    try:
        # Clean up the URL
        url = github_url.strip().rstrip('/')

        # Handle various GitHub URL formats
        patterns = [
            r'github\.com/([^/]+)/?$',   # https://github.com/username
            r'github\.com/([^/]+)/.*',   # https://github.com/username/anything
            r'^([a-zA-Z0-9\-_]+)$'       # Just a username
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                username = match.group(1)
                # Validate username format
                if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
                    return username

        return None

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
        return None
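
# Illustrative sketch (added for documentation, not part of the original module):
# the kinds of inputs the helper above accepts, given the patterns it defines.
# "octocat" is a placeholder username.
#   _extract_github_username("https://github.com/octocat")              -> "octocat"
#   _extract_github_username("https://github.com/octocat/Hello-World")  -> "octocat"
#   _extract_github_username("octocat")                                 -> "octocat"
#   _extract_github_username("https://example.com/octocat")             -> None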


def _get_github_user_info(username: str) -> Dict:
    """
    Get basic user information from GitHub API.

    Args:
        username (str): GitHub username

    Returns:
        dict: API response with user information
    """
    try:
        url = f"https://api.github.com/users/{username}"
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Resumate-App/1.0"
        }

        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 404:
            return {"status": "error", "message": f"GitHub user '{username}' not found"}
        elif response.status_code == 403:
            return {"status": "error", "message": "GitHub API rate limit exceeded"}
        elif response.status_code != 200:
            return {"status": "error", "message": f"GitHub API error: {response.status_code}"}

        return {"status": "success", "data": response.json()}

    except requests.RequestException as e:
        logger.error("Network error fetching user info: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}


def _get_user_repositories(username: str) -> Dict:
    """
    Get user's public repositories from GitHub API.

    Args:
        username (str): GitHub username

    Returns:
        dict: API response with repositories
    """
    try:
        # Get repositories with pagination
        all_repos = []
        page = 1
        per_page = 100  # Maximum allowed by GitHub API

        while True:
            url = f"https://api.github.com/users/{username}/repos"
            params = {
                "type": "public",
                "sort": "updated",
                "direction": "desc",
                "per_page": per_page,
                "page": page
            }
            headers = {
                "Accept": "application/vnd.github.v3+json",
                "User-Agent": "Resumate-App/1.0"
            }

            response = requests.get(url, headers=headers, params=params, timeout=10)

            if response.status_code != 200:
                return {"status": "error", "message": f"GitHub API error: {response.status_code}"}

            repos = response.json()
            if not repos:  # No more repositories
                break

            all_repos.extend(repos)

            # If we got less than per_page, we've reached the end
            if len(repos) < per_page:
                break

            page += 1

            # Safety limit to prevent infinite loops
            if page > 10:  # Max 1000 repos
                break

        return {"status": "success", "data": all_repos}

    except requests.RequestException as e:
        logger.error("Network error fetching repositories: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}


def _process_repository_data(repos: List[Dict]) -> List[Dict]:
    """
    Process and clean repository data for easier consumption.

    Args:
        repos (List[Dict]): Raw repository data from GitHub API

    Returns:
        List[Dict]: Processed repository data
    """
    processed = []

    for repo in repos:
        # Skip forks unless they have significant modifications
        if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
            continue

        processed_repo = {
            "name": repo.get("name", ""),
            # The API returns null for missing descriptions/languages, so
            # coerce None to "" rather than relying on the .get() default.
            "description": repo.get("description") or "",
            "language": repo.get("language") or "",
            "stars": repo.get("stargazers_count", 0),
            "forks": repo.get("forks_count", 0),
            "updated_at": repo.get("updated_at", ""),
            "created_at": repo.get("created_at", ""),
            "html_url": repo.get("html_url", ""),
            "topics": repo.get("topics", []),
            "size": repo.get("size", 0),
            "is_fork": repo.get("fork", False),
            "default_branch": repo.get("default_branch", "main"),
            "has_issues": repo.get("has_issues", False),
            "has_wiki": repo.get("has_wiki", False),
            "has_pages": repo.get("has_pages", False)
        }
        processed.append(processed_repo)

    return processed


def format_repositories_for_llm(github_result: Dict) -> str:
    """
    Format GitHub repositories data for LLM consumption.

    Args:
        github_result (dict): Result from get_github_repositories

    Returns:
        str: Formatted text ready for LLM context
    """
    if github_result.get("status") != "success":
        return "GitHub repositories could not be retrieved: " + \
               f"{github_result.get('message', 'Unknown error')}"

    repositories = github_result.get("repositories", [])
    metadata = github_result.get("metadata", {})

    if not repositories:
        return f"No public repositories found for {metadata.get('username', 'user')}"

    formatted_parts = [
        "=== GITHUB REPOSITORIES ===\n",
        f"Profile: {metadata.get('profile_url', 'N/A')}",
        f"Username: {metadata.get('username', 'N/A')}",
        f"Public Repositories: {len(repositories)}\n"
    ]

    for i, repo in enumerate(repositories[:20], 1):  # Limit to top 20 repos
        repo_info = [
            f"[REPOSITORY {i}]",
            f"Name: {repo['name']}",
            f"URL: {repo['html_url']}"
        ]

        if repo['description']:
            repo_info.append(f"Description: {repo['description']}")
        if repo['language']:
            repo_info.append(f"Primary Language: {repo['language']}")
        if repo['topics']:
            repo_info.append(f"Topics: {', '.join(repo['topics'][:5])}")  # Limit topics

        repo_info.extend([
            f"Stars: {repo['stars']} | Forks: {repo['forks']}",
            f"Last Updated: {repo['updated_at'][:10]}",  # Just the date
            ""  # Empty line between repositories
        ])

        formatted_parts.extend(repo_info)

    if len(repositories) > 20:
        formatted_parts.append(f"... and {len(repositories) - 20} more repositories")

    formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")

    return '\n'.join(formatted_parts)
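

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration; not part of the original
    # module): fetch the public repositories for a sample profile and print
    # the LLM-ready summary. "octocat" is a placeholder profile URL, not an
    # assumption about how the application calls this module.
    sample_result = get_github_repositories("https://github.com/octocat")
    print(format_repositories_for_llm(sample_result))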