""" github.py Functions for retrieving information from GitHub profiles and repositories. """ import re import json import logging from typing import List, Dict, Optional from pathlib import Path import requests # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def get_github_repositories(github_url: str) -> Dict: """ Retrieve public repositories from a GitHub profile URL. Args: github_url (str): GitHub profile URL (e.g., https://github.com/username) Returns: dict: Dictionary containing status, repositories list, and metadata Example: { "status": "success", "repositories": [ { "name": "repo-name", "description": "Repository description", "language": "Python", "stars": 10, "forks": 2, "updated_at": "2024-01-01T00:00:00Z", "html_url": "https://github.com/user/repo", "topics": ["python", "api"] } ], "metadata": { "username": "username", "total_repos": 25, "public_repos": 20 }, "message": "Successfully retrieved repositories" } """ if not github_url or not github_url.strip(): return {"status": "error", "message": "No GitHub URL provided"} try: # Extract username from GitHub URL username = _extract_github_username(github_url) if not username: return {"status": "error", "message": "Invalid GitHub URL format"} logger.info("Fetching repositories for GitHub user: %s", username) # Get user info first user_info = _get_github_user_info(username) if user_info["status"] != "success": return user_info # Get repositories repositories = _get_user_repositories(username) if repositories["status"] != "success": return repositories # Process and structure repository data processed_repos = _process_repository_data(repositories["data"]) result = { "status": "success", "repositories": processed_repos, "metadata": { "username": username, "total_repos": user_info["data"].get("public_repos", 0), "public_repos": len(processed_repos), "profile_url": github_url }, "message": f"Successfully retrieved {len(processed_repos)} repositories" } # Save results to JSON file try: data_dir = Path(__file__).parent.parent / "data" data_dir.mkdir(exist_ok=True) output_file = data_dir / "github_repos.json" with open(output_file, 'w', encoding='utf-8') as f: json.dump(result, f, indent=2, ensure_ascii=False) logger.info("GitHub repositories saved to %s", output_file) except Exception as save_error: # pylint: disable=broad-exception-caught logger.warning("Failed to save GitHub repositories to file: %s", str(save_error)) return result except Exception as e: # pylint: disable=broad-exception-caught logger.error("Error retrieving GitHub repositories: %s", str(e)) return { "status": "error", "message": f"Failed to retrieve GitHub repositories: {str(e)}" } def _extract_github_username(github_url: str) -> Optional[str]: """ Extract username from GitHub URL. 
def _extract_github_username(github_url: str) -> Optional[str]:
    """
    Extract username from GitHub URL.

    Args:
        github_url (str): GitHub profile URL

    Returns:
        Optional[str]: Username if valid URL, None otherwise
    """
    try:
        # Clean up the URL
        url = github_url.strip().rstrip('/')

        # Handle various GitHub URL formats
        patterns = [
            r'github\.com/([^/]+)/?$',    # https://github.com/username
            r'github\.com/([^/]+)/.*',    # https://github.com/username/anything
            r'^([a-zA-Z0-9\-_]+)$'        # Just a username
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                username = match.group(1)
                # Validate username format (GitHub usernames are at most 39 characters)
                if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
                    return username

        return None

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
        return None


def _get_github_user_info(username: str) -> Dict:
    """
    Get basic user information from the GitHub API.

    Args:
        username (str): GitHub username

    Returns:
        dict: API response with user information
    """
    try:
        url = f"https://api.github.com/users/{username}"
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Resumate-App/1.0"
        }

        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 404:
            return {"status": "error", "message": f"GitHub user '{username}' not found"}
        elif response.status_code == 403:
            return {"status": "error", "message": "GitHub API rate limit exceeded"}
        elif response.status_code != 200:
            return {"status": "error", "message": f"GitHub API error: {response.status_code}"}

        return {"status": "success", "data": response.json()}

    except requests.RequestException as e:
        logger.error("Network error fetching user info: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}


def _get_user_repositories(username: str) -> Dict:
    """
    Get a user's public repositories from the GitHub API.

    Args:
        username (str): GitHub username

    Returns:
        dict: API response with repositories
    """
    try:
        # Get repositories with pagination
        all_repos = []
        page = 1
        per_page = 100  # Maximum allowed by the GitHub API

        while True:
            url = f"https://api.github.com/users/{username}/repos"
            params = {
                "type": "public",
                "sort": "updated",
                "direction": "desc",
                "per_page": per_page,
                "page": page
            }
            headers = {
                "Accept": "application/vnd.github.v3+json",
                "User-Agent": "Resumate-App/1.0"
            }

            response = requests.get(url, headers=headers, params=params, timeout=10)

            if response.status_code != 200:
                return {"status": "error", "message": f"GitHub API error: {response.status_code}"}

            repos = response.json()
            if not repos:  # No more repositories
                break

            all_repos.extend(repos)

            # If we got fewer than per_page, we've reached the end
            if len(repos) < per_page:
                break

            page += 1

            # Safety limit to prevent infinite loops
            if page > 10:  # Max 1000 repos
                break

        return {"status": "success", "data": all_repos}

    except requests.RequestException as e:
        logger.error("Network error fetching repositories: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}

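# Illustrative mapping performed by the helper below (comment-only sketch; the
# raw object is abbreviated and uses made-up values). A raw GitHub API repository
# object is reduced to the small, resume-relevant subset of fields:
#
#     raw = [{"name": "demo", "fork": False, "stargazers_count": 3,
#             "forks_count": 1, "language": "Python", "topics": ["cli"]}]
#     _process_repository_data(raw)
#     # -> [{"name": "demo", "language": "Python", "stars": 3, "forks": 1, ...}]
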
def _process_repository_data(repos: List[Dict]) -> List[Dict]:
    """
    Process and clean repository data for easier consumption.

    Args:
        repos (List[Dict]): Raw repository data from GitHub API

    Returns:
        List[Dict]: Processed repository data
    """
    processed = []

    for repo in repos:
        # Skip forks unless they have attracted stars (a proxy for significant modifications)
        if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
            continue

        processed_repo = {
            "name": repo.get("name", ""),
            "description": repo.get("description", ""),
            "language": repo.get("language", ""),
            "stars": repo.get("stargazers_count", 0),
            "forks": repo.get("forks_count", 0),
            "updated_at": repo.get("updated_at", ""),
            "created_at": repo.get("created_at", ""),
            "html_url": repo.get("html_url", ""),
            "topics": repo.get("topics", []),
            "size": repo.get("size", 0),
            "is_fork": repo.get("fork", False),
            "default_branch": repo.get("default_branch", "main"),
            "has_issues": repo.get("has_issues", False),
            "has_wiki": repo.get("has_wiki", False),
            "has_pages": repo.get("has_pages", False)
        }
        processed.append(processed_repo)

    return processed


def format_repositories_for_llm(github_result: Dict) -> str:
    """
    Format GitHub repositories data for LLM consumption.

    Args:
        github_result (dict): Result from get_github_repositories

    Returns:
        str: Formatted text ready for LLM context
    """
    if github_result.get("status") != "success":
        return "GitHub repositories could not be retrieved: " + \
               f"{github_result.get('message', 'Unknown error')}"

    repositories = github_result.get("repositories", [])
    metadata = github_result.get("metadata", {})

    if not repositories:
        return f"No public repositories found for {metadata.get('username', 'user')}"

    formatted_parts = [
        "=== GITHUB REPOSITORIES ===\n",
        f"Profile: {metadata.get('profile_url', 'N/A')}",
        f"Username: {metadata.get('username', 'N/A')}",
        f"Public Repositories: {len(repositories)}\n"
    ]

    for i, repo in enumerate(repositories[:20], 1):  # Limit to top 20 repos
        repo_info = [
            f"[REPOSITORY {i}]",
            f"Name: {repo['name']}",
            f"URL: {repo['html_url']}"
        ]

        if repo['description']:
            repo_info.append(f"Description: {repo['description']}")

        if repo['language']:
            repo_info.append(f"Primary Language: {repo['language']}")

        if repo['topics']:
            repo_info.append(f"Topics: {', '.join(repo['topics'][:5])}")  # Limit topics

        repo_info.extend([
            f"Stars: {repo['stars']} | Forks: {repo['forks']}",
            f"Last Updated: {repo['updated_at'][:10]}",  # Just the date
            ""  # Empty line between repositories
        ])

        formatted_parts.extend(repo_info)

    if len(repositories) > 20:
        formatted_parts.append(f"... and {len(repositories) - 20} more repositories")

    formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")

    return '\n'.join(formatted_parts)

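# Minimal manual-test sketch (an assumption, not part of the original module's
# public interface): "https://github.com/octocat" is only a placeholder profile
# URL, and running this performs live, unauthenticated GitHub API calls, so it
# is subject to rate limiting.
if __name__ == "__main__":
    demo_result = get_github_repositories("https://github.com/octocat")
    if demo_result["status"] == "success":
        print(format_repositories_for_llm(demo_result))
    else:
        print(f"Lookup failed: {demo_result['message']}")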