"""
github.py

Functions for retrieving information from GitHub profiles and repositories.
"""

import re
import json
import logging
from typing import List, Dict, Optional, Tuple
from pathlib import Path

import requests

# pylint: disable=broad-exception-caught

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared settings for every GitHub REST API call made by this module.
_API_BASE = "https://api.github.com"
_REQUEST_TIMEOUT = 10  # seconds
_GITHUB_HEADERS = {
    "Accept": "application/vnd.github.v3+json",
    "User-Agent": "Resumate-App/1.0"
}

_PER_PAGE = 100  # Maximum page size allowed by the GitHub API
_MAX_PAGES = 10  # Safety limit: at most 1000 repositories are fetched
_MAX_USERNAME_LENGTH = 39

_USERNAME_RE = re.compile(r'[a-zA-Z0-9\-_]+')
_REPO_PART_RE = re.compile(r'[a-zA-Z0-9\-_\.]+')


def _strip_url_decorations(url: str) -> str:
    """Trim whitespace, any ?query / #fragment suffix, and trailing slashes."""
    return re.split(r'[?#]', url.strip(), maxsplit=1)[0].rstrip('/')


def _api_error(status_code: int, not_found_message: str) -> Dict:
    """Map a non-200 GitHub API status code to this module's error dictionary."""
    if status_code == 404:
        return {"status": "error", "message": not_found_message}
    # GitHub reports an exceeded rate limit with either 403 or 429.
    if status_code in (403, 429):
        return {"status": "error", "message": "GitHub API rate limit exceeded"}
    return {"status": "error", "message": f"GitHub API error: {status_code}"}


def _save_result_json(result: Dict, filename: str, label: str) -> None:
    """Best-effort dump of ``result`` to ``<package root>/data/github_repos/<filename>``.

    Failures are logged as warnings and never propagated, so a read-only or
    missing data directory cannot turn a successful API lookup into an error.
    """
    try:
        github_repos_dir = Path(__file__).parent.parent / "data" / "github_repos"
        github_repos_dir.mkdir(parents=True, exist_ok=True)
        output_file = github_repos_dir / filename

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        logger.info("%s saved to %s", label[:1].upper() + label[1:], output_file)
    except Exception as save_error:
        logger.warning("Failed to save %s to file: %s", label, save_error)


def get_github_repositories(github_url: str) -> Dict:
    """
    Retrieve public repositories from a GitHub profile URL.

    The result is also written (best effort) to
    ``data/github_repos/github_repos.json`` next to the package.

    Args:
        github_url (str): GitHub profile URL (e.g., https://github.com/username)

    Returns:
        dict: Dictionary containing status, repositories list, and metadata.
            On failure only "status" ("error") and "message" are present.

    Example:
        {
            "status": "success",
            "repositories": [
                {
                    "name": "repo-name",
                    "description": "Repository description",
                    "language": "Python",
                    "stars": 10,
                    "forks": 2,
                    "updated_at": "2024-01-01T00:00:00Z",
                    "html_url": "https://github.com/user/repo",
                    "topics": ["python", "api"]
                }
            ],
            "metadata": {
                "username": "username",
                "total_repos": 25,
                "public_repos": 20
            },
            "message": "Successfully retrieved repositories"
        }
    """
    if not github_url or not github_url.strip():
        return {"status": "error", "message": "No GitHub URL provided"}

    try:
        # Extract username from GitHub URL
        username = _extract_github_username(github_url)
        if not username:
            return {"status": "error", "message": "Invalid GitHub URL format"}

        logger.info("Fetching repositories for GitHub user: %s", username)

        # Get user info first
        user_info = _get_github_user_info(username)
        if user_info["status"] != "success":
            return user_info

        # Get repositories
        repositories = _get_user_repositories(username)
        if repositories["status"] != "success":
            return repositories

        # Process and structure repository data
        processed_repos = _process_repository_data(repositories["data"])

        result = {
            "status": "success",
            "repositories": processed_repos,
            "metadata": {
                "username": username,
                # "total_repos" is the count reported by the user profile;
                # "public_repos" is the count kept after fork filtering.
                "total_repos": user_info["data"].get("public_repos", 0),
                "public_repos": len(processed_repos),
                "profile_url": github_url
            },
            "message": f"Successfully retrieved {len(processed_repos)} repositories"
        }

        # Save results to JSON file
        _save_result_json(result, "github_repos.json", "GitHub repositories")

        return result

    except Exception as e:
        logger.error("Error retrieving GitHub repositories: %s", str(e))
        return {
            "status": "error",
            "message": f"Failed to retrieve GitHub repositories: {str(e)}"
        }


def _extract_github_username(github_url: str) -> Optional[str]:
    """
    Extract username from GitHub URL.

    Accepts a profile URL, any URL below a profile, or a bare username.
    A trailing ``?query`` or ``#fragment`` is ignored.

    Args:
        github_url (str): GitHub profile URL

    Returns:
        Optional[str]: Username if valid URL, None otherwise
    """
    try:
        # Clean up the URL
        url = _strip_url_decorations(github_url)

        # Handle various GitHub URL formats
        patterns = [
            r'github\.com/([^/]+)/?$',  # https://github.com/username
            r'github\.com/([^/]+)/.*',  # https://github.com/username/anything
            r'^([a-zA-Z0-9\-_]+)$'      # Just username
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                username = match.group(1)
                # Validate username format
                if (_USERNAME_RE.fullmatch(username)
                        and len(username) <= _MAX_USERNAME_LENGTH):
                    return username

        return None

    except Exception as e:
        logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
        return None


def _get_github_user_info(username: str) -> Dict:
    """
    Get basic user information from GitHub API.

    Args:
        username (str): GitHub username

    Returns:
        dict: {"status": "success", "data": <user JSON>} or
            {"status": "error", "message": <reason>}
    """
    try:
        url = f"{_API_BASE}/users/{username}"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code != 200:
            return _api_error(response.status_code, f"GitHub user '{username}' not found")

        return {"status": "success", "data": response.json()}

    except requests.RequestException as e:
        logger.error("Network error fetching user info: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}


def _get_user_repositories(username: str) -> Dict:
    """
    Get user's public repositories from GitHub API.

    Pages through the listing 100 repositories at a time and stops after
    10 pages, so at most 1000 repositories are returned.

    Args:
        username (str): GitHub username

    Returns:
        dict: {"status": "success", "data": <list of repo JSON>} or
            {"status": "error", "message": <reason>}
    """
    try:
        all_repos = []
        url = f"{_API_BASE}/users/{username}/repos"

        for page in range(1, _MAX_PAGES + 1):
            params = {
                "type": "public",
                "sort": "updated",
                "direction": "desc",
                "per_page": _PER_PAGE,
                "page": page
            }

            response = requests.get(
                url, headers=_GITHUB_HEADERS, params=params, timeout=_REQUEST_TIMEOUT
            )

            if response.status_code != 200:
                return _api_error(
                    response.status_code, f"GitHub user '{username}' not found"
                )

            repos = response.json()

            if not repos:  # No more repositories
                break

            all_repos.extend(repos)

            # If we got less than per_page, we've reached the end
            if len(repos) < _PER_PAGE:
                break

        return {"status": "success", "data": all_repos}

    except requests.RequestException as e:
        logger.error("Network error fetching repositories: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}


def _process_repository_data(repos: List[Dict]) -> List[Dict]:
    """
    Process and clean repository data for easier consumption.

    Forks with zero stars are dropped. The API sends ``null`` for unset
    description/language, so those are normalised to empty strings.

    Args:
        repos (List[Dict]): Raw repository data from GitHub API

    Returns:
        List[Dict]: Processed repository data
    """
    processed = []

    for repo in repos:
        # Skip forks that nobody has starred
        if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
            continue

        processed.append({
            "name": repo.get("name", ""),
            "description": repo.get("description") or "",
            "language": repo.get("language") or "",
            "stars": repo.get("stargazers_count", 0),
            "forks": repo.get("forks_count", 0),
            "updated_at": repo.get("updated_at", ""),
            "created_at": repo.get("created_at", ""),
            "html_url": repo.get("html_url", ""),
            "topics": repo.get("topics") or [],
            "size": repo.get("size", 0),
            "is_fork": repo.get("fork", False),
            "default_branch": repo.get("default_branch", "main"),
            "has_issues": repo.get("has_issues", False),
            "has_wiki": repo.get("has_wiki", False),
            "has_pages": repo.get("has_pages", False)
        })

    return processed


def format_repositories_for_llm(github_result: Dict) -> str:
    """
    Format GitHub repositories data for LLM consumption.

    Only the first 20 repositories (and first 5 topics of each) are listed;
    missing or ``None`` fields are rendered as empty instead of raising.

    Args:
        github_result (dict): Result from get_github_repositories

    Returns:
        str: Formatted text ready for LLM context
    """
    if github_result.get("status") != "success":
        return "GitHub repositories could not be retrieved: " + \
            f"{github_result.get('message', 'Unknown error')}"

    repositories = github_result.get("repositories", [])
    metadata = github_result.get("metadata", {})

    if not repositories:
        return f"No public repositories found for {metadata.get('username', 'user')}"

    formatted_parts = [
        "=== GITHUB REPOSITORIES ===\n",
        f"Profile: {metadata.get('profile_url', 'N/A')}",
        f"Username: {metadata.get('username', 'N/A')}",
        f"Public Repositories: {len(repositories)}\n"
    ]

    for i, repo in enumerate(repositories[:20], 1):  # Limit to top 20 repos
        repo_info = [
            f"[REPOSITORY {i}]",
            f"Name: {repo.get('name') or ''}",
            f"URL: {repo.get('html_url') or ''}"
        ]

        description = repo.get('description')
        if description:
            repo_info.append(f"Description: {description}")

        language = repo.get('language')
        if language:
            repo_info.append(f"Primary Language: {language}")

        topics = repo.get('topics') or []
        if topics:
            repo_info.append(f"Topics: {', '.join(topics[:5])}")  # Limit topics

        # Keep just the date part of the ISO timestamp
        updated_on = str(repo.get('updated_at') or '')[:10]

        repo_info.extend([
            f"Stars: {repo.get('stars', 0)} | Forks: {repo.get('forks', 0)}",
            f"Last Updated: {updated_on}",
            ""  # Empty line between repositories
        ])

        formatted_parts.extend(repo_info)

    if len(repositories) > 20:
        formatted_parts.append(f"... and {len(repositories) - 20} more repositories")

    formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")

    return '\n'.join(formatted_parts)


def get_repository_details(repo_url: str) -> Dict:
    """
    Get detailed information about a specific GitHub repository.

    The result is also written (best effort) to
    ``data/github_repos/repo_details_<owner>_<repo>.json``. Languages, README,
    file structure, releases and contributors are optional extras: when one
    of those lookups fails its field falls back to an empty value.

    Args:
        repo_url (str): GitHub repository URL (e.g., https://github.com/user/repo)

    Returns:
        dict: Dictionary containing comprehensive repository information.
            On failure only "status" ("error") and "message" are present.

    Example:
        {
            "status": "success",
            "repository": {
                "name": "repo-name",
                "full_name": "user/repo-name",
                "description": "Repository description",
                "language": "Python",
                "languages": {"Python": 85.5, "JavaScript": 14.5},
                "stars": 100,
                "forks": 25,
                "watchers": 50,
                "size": 1024,
                "created_at": "2024-01-01T00:00:00Z",
                "updated_at": "2024-01-15T00:00:00Z",
                "pushed_at": "2024-01-15T00:00:00Z",
                "html_url": "https://github.com/user/repo",
                "clone_url": "https://github.com/user/repo.git",
                "topics": ["python", "api", "web"],
                "license": {"name": "MIT License", "spdx_id": "MIT"},
                "readme": "README content here...",
                "file_structure": ["src/", "tests/", "README.md", "setup.py"],
                "releases": [{"tag_name": "v1.0.0", "name": "Release 1.0.0"}],
                "contributors": [{"login": "user1", "contributions": 50}],
                "is_fork": false,
                "is_archived": false,
                "is_private": false,
                "default_branch": "main",
                "open_issues": 5,
                "has_issues": true,
                "has_wiki": true,
                "has_pages": false
            },
            "message": "Successfully retrieved repository details"
        }
    """
    if not repo_url or not repo_url.strip():
        return {"status": "error", "message": "No repository URL provided"}

    try:
        # Extract owner and repo name from URL
        owner, repo_name = _extract_repo_info(repo_url)
        if not owner or not repo_name:
            return {"status": "error", "message": "Invalid GitHub repository URL format"}

        logger.info("Fetching detailed information for repository: %s/%s", owner, repo_name)

        # Get basic repository information
        repo_info = _get_repository_info(owner, repo_name)
        if repo_info["status"] != "success":
            return repo_info

        repo_data = repo_info["data"]

        # Optional extras, fetched in this order; a failed lookup is skipped.
        fetchers = {
            "languages": _get_repository_languages,
            "readme": _get_repository_readme,
            "file_structure": _get_repository_contents,
            "releases": _get_repository_releases,
            "contributors": _get_repository_contributors,
        }
        additional_data = {}
        for key, fetch in fetchers.items():
            outcome = fetch(owner, repo_name)
            if outcome["status"] == "success":
                additional_data[key] = outcome["data"]

        # Combine all data
        repository_details = {
            "name": repo_data.get("name", ""),
            "full_name": repo_data.get("full_name", ""),
            "description": repo_data.get("description") or "",
            "language": repo_data.get("language") or "",
            "languages": additional_data.get("languages", {}),
            "stars": repo_data.get("stargazers_count", 0),
            "forks": repo_data.get("forks_count", 0),
            "watchers": repo_data.get("watchers_count", 0),
            "size": repo_data.get("size", 0),
            "created_at": repo_data.get("created_at", ""),
            "updated_at": repo_data.get("updated_at", ""),
            "pushed_at": repo_data.get("pushed_at", ""),
            "html_url": repo_data.get("html_url", ""),
            "clone_url": repo_data.get("clone_url", ""),
            "ssh_url": repo_data.get("ssh_url", ""),
            "topics": repo_data.get("topics") or [],
            "license": repo_data.get("license") or {},
            "readme": additional_data.get("readme", ""),
            "file_structure": additional_data.get("file_structure", []),
            "releases": additional_data.get("releases", []),
            "contributors": additional_data.get("contributors", []),
            "is_fork": repo_data.get("fork", False),
            "is_archived": repo_data.get("archived", False),
            "is_private": repo_data.get("private", False),
            "default_branch": repo_data.get("default_branch", "main"),
            "open_issues": repo_data.get("open_issues_count", 0),
            "has_issues": repo_data.get("has_issues", False),
            "has_wiki": repo_data.get("has_wiki", False),
            "has_pages": repo_data.get("has_pages", False),
            "has_projects": repo_data.get("has_projects", False),
            "visibility": repo_data.get("visibility", "public")
        }

        result = {
            "status": "success",
            "repository": repository_details,
            "message": f"Successfully retrieved details for {owner}/{repo_name}"
        }

        # Save results to JSON file
        _save_result_json(
            result, f"repo_details_{owner}_{repo_name}.json", "repository details"
        )

        return result

    except Exception as e:
        logger.error("Error retrieving repository details: %s", str(e))
        return {
            "status": "error",
            "message": f"Failed to retrieve repository details: {str(e)}"
        }


def _extract_repo_info(repo_url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract owner and repository name from GitHub repository URL.

    Handles https URLs (with or without deeper paths, ``.git`` suffix,
    ``?query`` or ``#fragment``) and the SSH form ``git@github.com:owner/repo.git``.

    Args:
        repo_url (str): GitHub repository URL

    Returns:
        tuple: (owner, repo_name) if valid URL, (None, None) otherwise
    """
    try:
        # Clean up the URL
        url = _strip_url_decorations(repo_url)

        # Handle various GitHub repository URL formats
        patterns = [
            r'github\.com/([^/]+)/([^/]+)/?$',   # https://github.com/owner/repo
            r'github\.com/([^/]+)/([^/]+)/.*',   # https://github.com/owner/repo/anything
            r'git@github\.com:([^/]+)/([^/]+)$'  # git@github.com:owner/repo.git
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if not match:
                continue

            owner = match.group(1)
            repo_name = match.group(2)

            # Remove .git suffix if present
            if repo_name.endswith('.git'):
                repo_name = repo_name[:-4]

            # "." and ".." pass the character check but are path
            # navigation segments, never real owner/repository names.
            if owner in ('.', '..') or repo_name in ('.', '..'):
                continue

            # Validate format
            if _REPO_PART_RE.fullmatch(owner) and _REPO_PART_RE.fullmatch(repo_name):
                return owner, repo_name

        return None, None

    except Exception as e:
        logger.warning("Error extracting repo info from URL %s: %s", repo_url, str(e))
        return None, None


def _get_repository_info(owner: str, repo_name: str) -> Dict:
    """Get basic repository information from GitHub API.

    Returns {"status": "success", "data": <repo JSON>} or an error dictionary
    with a "message" describing the failure.
    """
    try:
        url = f"{_API_BASE}/repos/{owner}/{repo_name}"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code != 200:
            return _api_error(
                response.status_code, f"Repository '{owner}/{repo_name}' not found"
            )

        return {"status": "success", "data": response.json()}

    except requests.RequestException as e:
        logger.error("Network error fetching repository info: %s", str(e))
        return {"status": "error", "message": f"Network error: {str(e)}"}


def _get_repository_languages(owner: str, repo_name: str) -> Dict:
    """Get repository languages from GitHub API.

    On success "data" maps each language to its share of the repository's
    bytes as a percentage rounded to one decimal. A repository with no
    detected languages yields an empty mapping.
    """
    try:
        url = f"{_API_BASE}/repos/{owner}/{repo_name}/languages"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code == 200:
            # Convert byte counts to percentages
            languages = response.json()
            total_bytes = sum(languages.values())

            language_percentages = {}
            if total_bytes > 0:
                language_percentages = {
                    lang: round((bytes_count / total_bytes) * 100, 1)
                    for lang, bytes_count in languages.items()
                }

            return {"status": "success", "data": language_percentages}

        return {"status": "error", "message": "Could not retrieve languages"}

    except Exception as e:
        logger.warning("Error fetching repository languages: %s", str(e))
        return {"status": "error", "message": str(e)}


def _get_repository_readme(owner: str, repo_name: str) -> Dict:
    """Get repository README content from GitHub API.

    Looks up the README metadata, then downloads the raw text from its
    "download_url". On success "data" is the README text.
    """
    try:
        url = f"{_API_BASE}/repos/{owner}/{repo_name}/readme"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code == 200:
            readme_data = response.json()

            # Get the raw content URL and fetch it
            download_url = readme_data.get("download_url")
            if download_url:
                content_response = requests.get(download_url, timeout=_REQUEST_TIMEOUT)
                if content_response.status_code == 200:
                    return {"status": "success", "data": content_response.text}

        return {"status": "error", "message": "README not found"}

    except Exception as e:
        logger.warning("Error fetching README: %s", str(e))
        return {"status": "error", "message": str(e)}


def _get_repository_contents(owner: str, repo_name: str, path: str = "") -> Dict:
    """Get repository contents (file structure) from GitHub API.

    On success "data" is a list of entry names under ``path`` (repository
    root by default); directories carry a trailing "/" and are listed first.
    """
    try:
        url = f"{_API_BASE}/repos/{owner}/{repo_name}/contents/{path}"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code == 200:
            contents = response.json()

            # A path that names a single file yields one JSON object
            # rather than an array of entries.
            if isinstance(contents, dict):
                contents = [contents]

            # Extract file and directory names
            file_structure = []
            for item in contents:
                name = item.get("name", "")
                if item.get("type") == "dir":
                    name += "/"
                file_structure.append(name)

            # Sort with directories first
            file_structure.sort(key=lambda x: (not x.endswith("/"), x.lower()))

            return {"status": "success", "data": file_structure}

        return {"status": "error", "message": "Could not retrieve file structure"}

    except Exception as e:
        logger.warning("Error fetching repository contents: %s", str(e))
        return {"status": "error", "message": str(e)}


def _get_repository_releases(owner: str, repo_name: str) -> Dict:
    """Get repository releases from GitHub API.

    On success "data" holds a summary of at most the first 10 releases
    returned by the API.
    """
    try:
        url = f"{_API_BASE}/repos/{owner}/{repo_name}/releases"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code == 200:
            releases = response.json()

            # Extract key release information
            release_info = [
                {
                    "tag_name": release.get("tag_name", ""),
                    "name": release.get("name", ""),
                    "published_at": release.get("published_at", ""),
                    "prerelease": release.get("prerelease", False),
                    "draft": release.get("draft", False)
                }
                for release in releases[:10]  # Limit to 10 most recent
            ]

            return {"status": "success", "data": release_info}

        return {"status": "error", "message": "Could not retrieve releases"}

    except Exception as e:
        logger.warning("Error fetching repository releases: %s", str(e))
        return {"status": "error", "message": str(e)}


def _get_repository_contributors(owner: str, repo_name: str) -> Dict:
    """Get repository contributors from GitHub API.

    On success "data" holds a summary of at most the first 20 contributors
    returned by the API.
    """
    try:
        url = f"{_API_BASE}/repos/{owner}/{repo_name}/contributors"
        response = requests.get(url, headers=_GITHUB_HEADERS, timeout=_REQUEST_TIMEOUT)

        if response.status_code == 200:
            contributors = response.json()

            # Extract key contributor information
            contributor_info = [
                {
                    "login": contributor.get("login", ""),
                    "contributions": contributor.get("contributions", 0),
                    "html_url": contributor.get("html_url", ""),
                    "type": contributor.get("type", "")
                }
                for contributor in contributors[:20]  # Limit to top 20 contributors
            ]

            return {"status": "success", "data": contributor_info}

        return {"status": "error", "message": "Could not retrieve contributors"}

    except Exception as e:
        logger.warning("Error fetching repository contributors: %s", str(e))
        return {"status": "error", "message": str(e)}