resumate / functions /github.py
gperdrizet's picture
General clean up, added save of results to linkedin resume parsing and github repo list retrieval.
b9464fb verified
raw
history blame
10.9 kB
"""
github.py
Functions for retrieving information from GitHub profiles and repositories.
"""
import re
import json
import logging
from typing import List, Dict, Optional
from pathlib import Path
import requests
# Set up module-level logging.
# NOTE(review): logging.basicConfig at import time configures the root logger
# for the whole process; fine for an app module, but confirm no other module
# expects to own logging configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_github_repositories(github_url: str) -> Dict:
    """
    Fetch the public repositories behind a GitHub profile URL.

    Resolves the username from the URL, looks up the user's profile,
    pages through their public repositories, and returns a structured
    summary. On success the result is also written to data/github_repos.json
    (a failure to save is logged but does not fail the call).

    Args:
        github_url (str): GitHub profile URL (e.g., https://github.com/username)

    Returns:
        dict: On success, keys "status" ("success"), "repositories" (list of
            per-repo dicts), "metadata" (username, total_repos, public_repos,
            profile_url) and "message". On failure, "status" is "error" with
            an explanatory "message".
    """
    if not github_url or not github_url.strip():
        return {"status": "error", "message": "No GitHub URL provided"}

    try:
        username = _extract_github_username(github_url)
        if username is None:
            return {"status": "error", "message": "Invalid GitHub URL format"}

        logger.info("Fetching repositories for GitHub user: %s", username)

        # Profile lookup first: it validates the user and supplies repo counts.
        user_info = _get_github_user_info(username)
        if user_info["status"] != "success":
            return user_info

        repo_response = _get_user_repositories(username)
        if repo_response["status"] != "success":
            return repo_response

        cleaned_repos = _process_repository_data(repo_response["data"])

        result = {
            "status": "success",
            "repositories": cleaned_repos,
            "metadata": {
                "username": username,
                "total_repos": user_info["data"].get("public_repos", 0),
                "public_repos": len(cleaned_repos),
                "profile_url": github_url
            },
            "message": f"Successfully retrieved {len(cleaned_repos)} repositories"
        }

        # Best-effort persistence of the result; never let a save failure
        # break the API response.
        try:
            output_file = Path(__file__).parent.parent / "data" / "github_repos.json"
            output_file.parent.mkdir(exist_ok=True)
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            logger.info("GitHub repositories saved to %s", output_file)
        except Exception as save_error:  # pylint: disable=broad-exception-caught
            logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))

        return result

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error("Error retrieving GitHub repositories: %s", str(e))
        return {
            "status": "error",
            "message": f"Failed to retrieve GitHub repositories: {str(e)}"
        }
def _extract_github_username(github_url: str) -> Optional[str]:
"""
Extract username from GitHub URL.
Args:
github_url (str): GitHub profile URL
Returns:
Optional[str]: Username if valid URL, None otherwise
"""
try:
# Clean up the URL
url = github_url.strip().rstrip('/')
# Handle various GitHub URL formats
patterns = [
r'github\.com/([^/]+)/?$', # https://github.com/username
r'github\.com/([^/]+)/.*', # https://github.com/username/anything
r'^([a-zA-Z0-9\-_]+)$' # Just username
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
username = match.group(1)
# Validate username format
if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
return username
return None
except Exception as e: # pylint: disable=broad-exception-caught
logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
return None
def _get_github_user_info(username: str) -> Dict:
    """
    Fetch profile metadata for a user from the GitHub REST API.

    Args:
        username (str): GitHub username

    Returns:
        dict: {"status": "success", "data": <profile json>} on success,
            or {"status": "error", "message": ...} describing the failure
            (unknown user, rate limit, other HTTP error, or network error).
    """
    request_headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "Resumate-App/1.0"
    }
    try:
        response = requests.get(
            f"https://api.github.com/users/{username}",
            headers=request_headers,
            timeout=10,
        )

        # Map the interesting HTTP failures to specific messages.
        if response.status_code == 404:
            return {"status": "error", "message": f"GitHub user '{username}' not found"}
        if response.status_code == 403:
            return {"status": "error", "message": "GitHub API rate limit exceeded"}
        if response.status_code != 200:
            return {"status": "error", "message": f"GitHub API error: {response.status_code}"}

        return {"status": "success", "data": response.json()}
    except requests.RequestException as e:
        logger.error("Network error fetching user info: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}
def _get_user_repositories(username: str) -> Dict:
    """
    Get a user's public repositories from the GitHub API, with pagination.

    Fetches up to 10 pages of 100 repositories (a safety cap of 1000 repos),
    most recently updated first.

    Args:
        username (str): GitHub username

    Returns:
        dict: {"status": "success", "data": [<repo json>, ...]} on success,
            or {"status": "error", "message": ...} on an HTTP or network error.
    """
    # Loop-invariant request pieces built once, outside the pagination loop.
    url = f"https://api.github.com/users/{username}/repos"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "Resumate-App/1.0"
    }
    per_page = 100  # Maximum allowed by GitHub API
    max_pages = 10  # Safety limit: at most 1000 repos
    params = {
        # NOTE(review): the users/{username}/repos endpoint documents
        # type values all/owner/member; confirm "public" is accepted here.
        "type": "public",
        "sort": "updated",
        "direction": "desc",
        "per_page": per_page,
        "page": 1
    }

    all_repos = []
    try:
        for page in range(1, max_pages + 1):
            params["page"] = page
            response = requests.get(url, headers=headers, params=params, timeout=10)
            if response.status_code != 200:
                return {"status": "error", "message": f"GitHub API error: {response.status_code}"}

            repos = response.json()
            if not repos:  # No more repositories
                break
            all_repos.extend(repos)

            # A short page means we've reached the end.
            if len(repos) < per_page:
                break

        return {"status": "success", "data": all_repos}
    except requests.RequestException as e:
        logger.error("Network error fetching repositories: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}
def _process_repository_data(repos: List[Dict]) -> List[Dict]:
"""
Process and clean repository data for easier consumption.
Args:
repos (List[Dict]): Raw repository data from GitHub API
Returns:
List[Dict]: Processed repository data
"""
processed = []
for repo in repos:
# Skip forks unless they have significant modifications
if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
continue
processed_repo = {
"name": repo.get("name", ""),
"description": repo.get("description", ""),
"language": repo.get("language", ""),
"stars": repo.get("stargazers_count", 0),
"forks": repo.get("forks_count", 0),
"updated_at": repo.get("updated_at", ""),
"created_at": repo.get("created_at", ""),
"html_url": repo.get("html_url", ""),
"topics": repo.get("topics", []),
"size": repo.get("size", 0),
"is_fork": repo.get("fork", False),
"default_branch": repo.get("default_branch", "main"),
"has_issues": repo.get("has_issues", False),
"has_wiki": repo.get("has_wiki", False),
"has_pages": repo.get("has_pages", False)
}
processed.append(processed_repo)
return processed
def format_repositories_for_llm(github_result: Dict) -> str:
    """
    Render a get_github_repositories result as plain text for LLM context.

    Shows at most the first 20 repositories (with a trailer noting how
    many were omitted), each with name, URL, and any available
    description, language, topics, star/fork counts, and update date.

    Args:
        github_result (dict): Result from get_github_repositories

    Returns:
        str: Formatted text ready for LLM context
    """
    if github_result.get("status") != "success":
        reason = github_result.get('message', 'Unknown error')
        return f"GitHub repositories could not be retrieved: {reason}"

    repos = github_result.get("repositories", [])
    meta = github_result.get("metadata", {})
    if not repos:
        return f"No public repositories found for {meta.get('username', 'user')}"

    lines = [
        "=== GITHUB REPOSITORIES ===\n",
        f"Profile: {meta.get('profile_url', 'N/A')}",
        f"Username: {meta.get('username', 'N/A')}",
        f"Public Repositories: {len(repos)}\n"
    ]

    # Cap the listing at the top 20 repositories.
    for index, entry in enumerate(repos[:20], start=1):
        lines.append(f"[REPOSITORY {index}]")
        lines.append(f"Name: {entry['name']}")
        lines.append(f"URL: {entry['html_url']}")
        if entry['description']:
            lines.append(f"Description: {entry['description']}")
        if entry['language']:
            lines.append(f"Primary Language: {entry['language']}")
        if entry['topics']:
            # At most five topics to keep the context compact.
            lines.append(f"Topics: {', '.join(entry['topics'][:5])}")
        lines.append(f"Stars: {entry['stars']} | Forks: {entry['forks']}")
        lines.append(f"Last Updated: {entry['updated_at'][:10]}")  # Just the date
        lines.append("")  # Blank separator between repositories

    omitted = len(repos) - 20
    if omitted > 0:
        lines.append(f"... and {omitted} more repositories")

    lines.append("\n=== END GITHUB REPOSITORIES ===")
    return '\n'.join(lines)