"""
github.py

Functions for retrieving information from GitHub profiles and repositories.
"""
# import re
import json
import logging
import base64
from typing import List, Dict
from pathlib import Path
from datetime import datetime
import requests
# pylint: disable=broad-exception-caught
def get_github_repositories(username: str) -> list:
    """
    Retrieve public repositories from a GitHub profile.

    Args:
        username (str): GitHub username (e.g., "octocat")

    Returns:
        list: List of dictionaries of repository information, or None when
        the GitHub API request fails.

    Example:
        [
            {
                "name": "repo-name",
                "description": "Repository description",
                "language": "Python",
                "stars": 10,
                "forks": 2,
                "updated_at": "2024-01-01T00:00:00Z",
                "html_url": "https://github.com/user/repo",
                "topics": ["python", "api"],
                "readme": "# Project Title\n\nProject description..."
            }
        ]
    """
    logger = logging.getLogger(f'{__name__}.get_github_repositories')
    try:
        logger.info("Fetching repositories for GitHub user: %s", username)
        # Fetch raw repository records, then normalize them
        repositories = _get_user_repositories(username)
        if repositories:
            repositories = _process_repository_data(repositories)
            # Persist a timestamped snapshot; a save failure is non-fatal
            try:
                github_repos_dir = Path(__file__).parent.parent / "data" / "github_repos"
                github_repos_dir.mkdir(parents=True, exist_ok=True)
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                output_file = github_repos_dir / f"github_repos_{timestamp}.json"
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(repositories, f, indent=2, ensure_ascii=False)
            except Exception as save_error:
                logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))
    except Exception as e:
        logger.error("Error retrieving GitHub repositories: %s", str(e))
        return None
    return repositories
def _get_user_repositories(username: str) -> List[Dict]:
    """
    Get a user's public repositories from the GitHub API, paginating
    through results.

    Args:
        username (str): GitHub username

    Returns:
        List[Dict]: Raw repository records, or None when the API returns a
        non-200 status or a network error occurs before any page arrived.
        Partial results are returned if a later page fails.
    """
    logger = logging.getLogger(f'{__name__}._get_user_repositories')
    # Assigned before the try so the except handler can safely reference it
    all_repos = []
    try:
        # Request pieces that do not change between pages
        url = f"https://api.github.com/users/{username}/repos"
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Resumate-App/1.0"
        }
        page = 1
        per_page = 100  # Maximum allowed by GitHub API
        while True:
            params = {
                "type": "public",
                "sort": "updated",
                "direction": "desc",
                "per_page": per_page,
                "page": page
            }
            response = requests.get(url, headers=headers, params=params, timeout=10)
            if response.status_code != 200:
                logger.error("GitHub API error: %s", response.status_code)
                return None
            repos = response.json()
            if not repos:  # No more repositories
                break
            all_repos.extend(repos)
            # A short page means we've reached the end
            if len(repos) < per_page:
                break
            page += 1
            # Safety limit to prevent infinite loops (max 1000 repos)
            if page > 10:
                break
        return all_repos
    except requests.RequestException as e:
        logger.error("Network error fetching repositories: %s", str(e))
        if all_repos:
            # Return whatever pages were fetched before the failure
            logger.info("Returning partial repository data due to error")
            return all_repos
        logger.error("No repositories found and network error occurred")
        return None
def _process_repository_data(repos: List[Dict]) -> List[Dict]: | |
""" | |
Process and clean repository data for easier consumption. | |
Args: | |
repos (List[Dict]): Raw repository data from GitHub API | |
Returns: | |
List[Dict]: Processed repository data | |
""" | |
logger = logging.getLogger(f'{__name__}._process_repository_data') | |
processed = [] | |
for repo in repos: | |
# Skip forks unless they have significant modifications | |
if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0: | |
continue | |
try: | |
processed_repo = { | |
"name": repo.get("name", ""), | |
"description": repo.get("description", ""), | |
"language": repo.get("language", ""), | |
"stars": repo.get("stargazers_count", 0), | |
"forks": repo.get("forks_count", 0), | |
"updated_at": repo.get("updated_at", ""), | |
"created_at": repo.get("created_at", ""), | |
"html_url": repo.get("html_url", ""), | |
"topics": repo.get("topics", []), | |
"size": repo.get("size", 0) | |
} | |
# Get README content for the repository | |
repo_url = repo.get("html_url", "") | |
if repo_url: | |
readme_content = get_repository_readme(repo_url) | |
processed_repo["readme"] = readme_content | |
else: | |
processed_repo["readme"] = "" | |
processed.append(processed_repo) | |
except Exception as e: | |
logger.error("Error processing repository data: %s", str(e)) | |
continue | |
return processed | |
def get_repository_readme(repo_url: str) -> str:
    """
    Fetch the full text of a repository's README file.

    Args:
        repo_url (str): GitHub repository URL (e.g., "https://github.com/owner/repo")

    Returns:
        str: README file content as text, or empty string if not found/error

    Example:
        >>> readme_content = get_repository_readme("https://github.com/owner/repo")
        >>> print(readme_content[:100])
        # My Project
        This is a sample project that does...
    """
    logger = logging.getLogger(f'{__name__}.get_repository_readme')
    prefix = "https://github.com/"
    try:
        # Validate the URL and split it into owner/repo components
        if not repo_url.startswith(prefix):
            logger.error("Invalid GitHub URL format: %s", repo_url)
            return ""
        repo_url = repo_url.rstrip("/")
        path_parts = repo_url.replace(prefix, "").split("/")
        if len(path_parts) != 2:
            logger.error("Invalid GitHub URL format, expected owner/repo: %s", repo_url)
            return ""
        owner, repo = path_parts
        logger.info("Fetching README for repository: %s/%s", owner, repo)
        # Ask the GitHub API for the repo's README metadata
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={
                "Accept": "application/vnd.github.v3+json",
                "User-Agent": "Resumate-App/1.0"
            },
            timeout=10,
        )
        if response.status_code == 404:
            logger.info("No README file found for repository: %s/%s", owner, repo)
            return ""
        if response.status_code != 200:
            logger.error("GitHub API error fetching README: %s", response.status_code)
            return ""
        readme_data = response.json()
        if "content" not in readme_data:
            logger.error("README API response missing content field")
            return ""
        # The README body arrives base64-encoded, possibly with embedded
        # whitespace that must be stripped before decoding
        encoded = readme_data["content"].replace("\n", "").replace(" ", "")
        try:
            decoded_content = base64.b64decode(encoded).decode('utf-8')
            logger.info(
                "Successfully retrieved README content (%d characters)",
                len(decoded_content)
            )
            return decoded_content
        except Exception as decode_error:
            logger.error("Error decoding README content: %s", str(decode_error))
            return ""
    except requests.RequestException as e:
        logger.error("Network error fetching README: %s", str(e))
        return ""
    except Exception as e:
        logger.error("Error retrieving README: %s", str(e))
        return ""