"""
github.py
Functions for retrieving information from GitHub profiles and repositories.
"""
# import re
import json
import logging
import base64
from typing import List, Dict, Optional
from pathlib import Path
from datetime import datetime
import requests
# pylint: disable=broad-exception-caught
def get_github_repositories(username: str) -> Optional[List[Dict]]:
"""
    Retrieve public repositories for a GitHub username.
Args:
        username (str): GitHub username (e.g., 'octocat')
Returns:
        list: List of dictionaries with repository information, or None on error
Example:
[
{
"name": "repo-name",
"description": "Repository description",
"language": "Python",
"stars": 10,
"forks": 2,
"updated_at": "2024-01-01T00:00:00Z",
"html_url": "https://github.com/user/repo",
"topics": ["python", "api"],
"readme": "# Project Title\n\nProject description..."
}
]
"""
logger = logging.getLogger(f'{__name__}.get_github_repositories')
try:
logger.info("Fetching repositories for GitHub user: %s", username)
# Get repositories
repositories = _get_user_repositories(username)
if repositories:
repositories = _process_repository_data(repositories)
# Save results to JSON file
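        # A timestamped snapshot is written to data/github_repos/ so results can be
        # inspected or reused later; a failed save is logged but is non-fatal.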
try:
github_repos_dir = Path(__file__).parent.parent / "data" / "github_repos"
github_repos_dir.mkdir(parents=True, exist_ok=True)
# Create timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = github_repos_dir / f"github_repos_{timestamp}.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(repositories, f, indent=2, ensure_ascii=False)
except Exception as save_error:
logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))
except Exception as e:
logger.error("Error retrieving GitHub repositories: %s", str(e))
return None
return repositories
def _get_user_repositories(username: str) -> Optional[List[Dict]]:
"""
Get user's public repositories from GitHub API.
Args:
username (str): GitHub username
Returns:
        List[Dict]: Raw repository dictionaries from the GitHub API, or None on error
"""
logger = logging.getLogger(f'{__name__}._get_user_repositories')
try:
# Get repositories with pagination
all_repos = []
page = 1
per_page = 100 # Maximum allowed by GitHub API
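        # Walk the paginated results until GitHub returns an empty or short page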
while True:
url = f"https://api.github.com/users/{username}/repos"
params = {
"type": "public",
"sort": "updated",
"direction": "desc",
"per_page": per_page,
"page": page
}
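            # GitHub's REST API requires a User-Agent header; requests without one are rejected.
            # Note: unauthenticated requests are rate-limited (60 requests/hour), which also
            # bounds the per-repository README fetches performed later.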
headers = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "Resumate-App/1.0"
}
response = requests.get(url, headers=headers, params=params, timeout=10)
if response.status_code != 200:
logger.error("GitHub API error: %s", response.status_code)
return None
repos = response.json()
if not repos: # No more repositories
break
all_repos.extend(repos)
            # If we received fewer than per_page results, we've reached the last page
if len(repos) < per_page:
break
page += 1
# Safety limit to prevent infinite loops
if page > 10: # Max 1000 repos
break
return all_repos
except requests.RequestException as e:
logger.error("Network error fetching repositories: %s", str(e))
# If we have some repos, return them
if len(all_repos) > 0:
logger.info("Returning partial repository data due to error")
return all_repos
else:
logger.error("No repositories found and network error occurred")
return None
def _process_repository_data(repos: List[Dict]) -> List[Dict]:
"""
Process and clean repository data for easier consumption.
Args:
repos (List[Dict]): Raw repository data from GitHub API
Returns:
List[Dict]: Processed repository data
"""
logger = logging.getLogger(f'{__name__}._process_repository_data')
processed = []
for repo in repos:
        # Skip forked repositories that have not earned any stars of their own
if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
continue
try:
processed_repo = {
"name": repo.get("name", ""),
"description": repo.get("description", ""),
"language": repo.get("language", ""),
"stars": repo.get("stargazers_count", 0),
"forks": repo.get("forks_count", 0),
"updated_at": repo.get("updated_at", ""),
"created_at": repo.get("created_at", ""),
"html_url": repo.get("html_url", ""),
"topics": repo.get("topics", []),
"size": repo.get("size", 0)
}
# Get README content for the repository
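            # Note: this issues one extra API call per repository, so large profiles take
            # longer to process and consume more of the API rate limit.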
repo_url = repo.get("html_url", "")
if repo_url:
readme_content = get_repository_readme(repo_url)
processed_repo["readme"] = readme_content
else:
processed_repo["readme"] = ""
processed.append(processed_repo)
except Exception as e:
logger.error("Error processing repository data: %s", str(e))
continue
return processed
def get_repository_readme(repo_url: str) -> str:
"""
    Get the full text content of a repository's README file.
Args:
repo_url (str): GitHub repository URL (e.g., "https://github.com/owner/repo")
Returns:
str: README file content as text, or empty string if not found/error
Example:
>>> readme_content = get_repository_readme("https://github.com/owner/repo")
>>> print(readme_content[:100])
# My Project
This is a sample project that does...
"""
logger = logging.getLogger(f'{__name__}.get_repository_readme')
try:
# Extract owner and repo name from URL
if not repo_url.startswith("https://github.com/"):
logger.error("Invalid GitHub URL format: %s", repo_url)
return ""
# Remove trailing slash and split
repo_url = repo_url.rstrip("/")
parts = repo_url.replace("https://github.com/", "").split("/")
if len(parts) != 2:
logger.error("Invalid GitHub URL format, expected owner/repo: %s", repo_url)
return ""
owner, repo = parts
logger.info("Fetching README for repository: %s/%s", owner, repo)
# GitHub API endpoint for README
api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
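        # The /readme endpoint returns the repository's preferred README regardless of
        # its exact filename (README.md, README.rst, etc.)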
headers = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "Resumate-App/1.0"
}
response = requests.get(api_url, headers=headers, timeout=10)
if response.status_code == 404:
logger.info("No README file found for repository: %s/%s", owner, repo)
return ""
if response.status_code != 200:
logger.error("GitHub API error fetching README: %s", response.status_code)
return ""
readme_data = response.json()
# README content is base64 encoded
if "content" not in readme_data:
logger.error("README API response missing content field")
return ""
# Decode base64 content
encoded_content = readme_data["content"]
# Remove any whitespace/newlines from base64 string
encoded_content = encoded_content.replace("\n", "").replace(" ", "")
try:
decoded_content = base64.b64decode(encoded_content).decode('utf-8')
logger.info(
"Successfully retrieved README content (%d characters)",
len(decoded_content)
)
return decoded_content
except Exception as decode_error:
logger.error("Error decoding README content: %s", str(decode_error))
return ""
except requests.RequestException as e:
logger.error("Network error fetching README: %s", str(e))
return ""
except Exception as e:
logger.error("Error retrieving README: %s", str(e))
return ""
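

# Minimal usage sketch (assumption: run as a script with network access available;
# 'octocat' is a placeholder username used purely for illustration, not part of the app).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    example_repos = get_github_repositories("octocat")

    if example_repos:
        # Print a short summary of the five most recently updated repositories
        for example_repo in example_repos[:5]:
            print(f"{example_repo['name']} ({example_repo['stars']} stars): {example_repo['html_url']}")
    else:
        print("No repositories retrieved")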