"""
github.py
Functions for retrieving information from GitHub profiles and repositories.
"""
# import re
import json
import logging
import base64
from typing import List, Dict, Optional
from pathlib import Path
from datetime import datetime
import requests
# pylint: disable=broad-exception-caught
def get_github_repositories(username: str) -> Optional[List[Dict]]:
"""
    Retrieve public repositories for a GitHub username.
Args:
        username (str): GitHub username (e.g., 'octocat')
Returns:
        list: List of dictionaries with repository information, or None on error
Example:
[
{
"name": "repo-name",
"description": "Repository description",
"language": "Python",
"stars": 10,
"forks": 2,
"updated_at": "2024-01-01T00:00:00Z",
"html_url": "https://github.com/user/repo",
"topics": ["python", "api"],
"readme": "# Project Title\n\nProject description..."
}
]
"""
logger = logging.getLogger(f'{__name__}.get_github_repositories')
try:
logger.info("Fetching repositories for GitHub user: %s", username)
# Get repositories
repositories = _get_user_repositories(username)
if repositories:
repositories = _process_repository_data(repositories)
# Save results to JSON file
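        # A timestamped snapshot is written to data/github_repos/ so results can be
        # inspected or reused later; a failed save is logged but is non-fatal.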
try:
github_repos_dir = Path(__file__).parent.parent / "data" / "github_repos"
github_repos_dir.mkdir(parents=True, exist_ok=True)
# Create timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = github_repos_dir / f"github_repos_{timestamp}.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(repositories, f, indent=2, ensure_ascii=False)
except Exception as save_error:
logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))
except Exception as e:
logger.error("Error retrieving GitHub repositories: %s", str(e))
return None
return repositories
def _get_user_repositories(username: str) -> Optional[List[Dict]]:
"""
Get user's public repositories from GitHub API.
Args:
username (str): GitHub username
Returns:
        List[Dict]: Raw repository dictionaries from the GitHub API, or None on error
"""
logger = logging.getLogger(f'{__name__}._get_user_repositories')
try:
# Get repositories with pagination
all_repos = []
page = 1
per_page = 100 # Maximum allowed by GitHub API
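        # Walk the paginated results until GitHub returns an empty or short page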
while True:
url = f"https://api.github.com/users/{username}/repos"
params = {
"type": "public",
"sort": "updated",
"direction": "desc",
"per_page": per_page,
"page": page
}
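            # GitHub's REST API requires a User-Agent header; requests without one are rejected.
            # Note: unauthenticated requests are rate-limited (60 requests/hour), which also
            # bounds the per-repository README fetches performed later.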
headers = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "Resumate-App/1.0"
}
response = requests.get(url, headers=headers, params=params, timeout=10)
if response.status_code != 200:
logger.error("GitHub API error: %s", response.status_code)
return None
repos = response.json()
if not repos: # No more repositories
break
all_repos.extend(repos)
            # If we received fewer than per_page results, we've reached the last page
if len(repos) < per_page:
break
page += 1
# Safety limit to prevent infinite loops
if page > 10: # Max 1000 repos
break
return all_repos
except requests.RequestException as e:
logger.error("Network error fetching repositories: %s", str(e))
# If we have some repos, return them
if len(all_repos) > 0:
logger.info("Returning partial repository data due to error")
return all_repos
else:
logger.error("No repositories found and network error occurred")
return None
def _process_repository_data(repos: List[Dict]) -> List[Dict]:
"""
Process and clean repository data for easier consumption.
Args:
repos (List[Dict]): Raw repository data from GitHub API
Returns:
List[Dict]: Processed repository data
"""
logger = logging.getLogger(f'{__name__}._process_repository_data')
processed = []
for repo in repos:
        # Skip forked repositories that have not earned any stars of their own
if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
continue
try:
processed_repo = {
"name": repo.get("name", ""),
"description": repo.get("description", ""),
"language": repo.get("language", ""),
"stars": repo.get("stargazers_count", 0),
"forks": repo.get("forks_count", 0),
"updated_at": repo.get("updated_at", ""),
"created_at": repo.get("created_at", ""),
"html_url": repo.get("html_url", ""),
"topics": repo.get("topics", []),
"size": repo.get("size", 0)
}
# Get README content for the repository
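            # Note: this issues one extra API call per repository, so large profiles take
            # longer to process and consume more of the API rate limit.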
repo_url = repo.get("html_url", "")
if repo_url:
readme_content = get_repository_readme(repo_url)
processed_repo["readme"] = readme_content
else:
processed_repo["readme"] = ""
processed.append(processed_repo)
except Exception as e:
logger.error("Error processing repository data: %s", str(e))
continue
return processed
def get_repository_readme(repo_url: str) -> str:
"""
    Get the full text content of a repository's README file.
Args:
repo_url (str): GitHub repository URL (e.g., "https://github.com/owner/repo")
Returns:
str: README file content as text, or empty string if not found/error
Example:
>>> readme_content = get_repository_readme("https://github.com/owner/repo")
>>> print(readme_content[:100])
# My Project
This is a sample project that does...
"""
logger = logging.getLogger(f'{__name__}.get_repository_readme')
try:
# Extract owner and repo name from URL
if not repo_url.startswith("https://github.com/"):
logger.error("Invalid GitHub URL format: %s", repo_url)
return ""
# Remove trailing slash and split
repo_url = repo_url.rstrip("/")
parts = repo_url.replace("https://github.com/", "").split("/")
if len(parts) != 2:
logger.error("Invalid GitHub URL format, expected owner/repo: %s", repo_url)
return ""
owner, repo = parts
logger.info("Fetching README for repository: %s/%s", owner, repo)
# GitHub API endpoint for README
api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
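        # The /readme endpoint returns the repository's preferred README regardless of
        # its exact filename (README.md, README.rst, etc.)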
headers = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "Resumate-App/1.0"
}
response = requests.get(api_url, headers=headers, timeout=10)
if response.status_code == 404:
logger.info("No README file found for repository: %s/%s", owner, repo)
return ""
if response.status_code != 200:
logger.error("GitHub API error fetching README: %s", response.status_code)
return ""
readme_data = response.json()
# README content is base64 encoded
if "content" not in readme_data:
logger.error("README API response missing content field")
return ""
# Decode base64 content
encoded_content = readme_data["content"]
# Remove any whitespace/newlines from base64 string
encoded_content = encoded_content.replace("\n", "").replace(" ", "")
try:
decoded_content = base64.b64decode(encoded_content).decode('utf-8')
logger.info(
"Successfully retrieved README content (%d characters)",
len(decoded_content)
)
return decoded_content
except Exception as decode_error:
logger.error("Error decoding README content: %s", str(decode_error))
return ""
except requests.RequestException as e:
logger.error("Network error fetching README: %s", str(e))
return ""
except Exception as e:
logger.error("Error retrieving README: %s", str(e))
return ""
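

# Minimal usage sketch (assumption: run as a script with network access available;
# 'octocat' is a placeholder username used purely for illustration, not part of the app).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    example_repos = get_github_repositories("octocat")

    if example_repos:
        # Print a short summary of the five most recently updated repositories
        for example_repo in example_repos[:5]:
            print(f"{example_repo['name']} ({example_repo['stars']} stars): {example_repo['html_url']}")
    else:
        print("No repositories retrieved")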