General clean up; added saving of results for LinkedIn resume parsing and GitHub repo list retrieval.
Files changed:
- functions/github.py +85 -51
- functions/gradio.py +46 -31
- functions/job_call.py +7 -1
- functions/linkedin_resume.py +87 -48
- functions/writer_agent.py +14 -7
functions/github.py
CHANGED
@@ -5,9 +5,12 @@ Functions for retrieving information from GitHub profiles and repositories.
 """
 
 import re
+import json
 import logging
-import requests
 from typing import List, Dict, Optional
+from pathlib import Path
+
+import requests
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -49,29 +52,32 @@ def get_github_repositories(github_url: str) -> Dict:
     """
     if not github_url or not github_url.strip():
         return {"status": "error", "message": "No GitHub URL provided"}
-
+
     try:
         # Extract username from GitHub URL
         username = _extract_github_username(github_url)
+
         if not username:
             return {"status": "error", "message": "Invalid GitHub URL format"}
-
-        logger.info(
-
+
+        logger.info("Fetching repositories for GitHub user: %s", username)
+
         # Get user info first
        user_info = _get_github_user_info(username)
+
         if user_info["status"] != "success":
             return user_info
-
+
         # Get repositories
         repositories = _get_user_repositories(username)
+
         if repositories["status"] != "success":
             return repositories
-
+
         # Process and structure repository data
         processed_repos = _process_repository_data(repositories["data"])
-
-
+
+        result = {
             "status": "success",
             "repositories": processed_repos,
             "metadata": {
@@ -82,9 +88,25 @@ def get_github_repositories(github_url: str) -> Dict:
             },
             "message": f"Successfully retrieved {len(processed_repos)} repositories"
         }
-
-
-
+
+        # Save results to JSON file
+        try:
+            data_dir = Path(__file__).parent.parent / "data"
+            data_dir.mkdir(exist_ok=True)
+
+            output_file = data_dir / "github_repos.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            logger.info("GitHub repositories saved to %s", output_file)
+        except Exception as save_error: # pylint: disable=broad-exception-caught
+            logger.warning("Failed to save GitHub repositories to file: %s", str(save_error))
+
+        return result
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error("Error retrieving GitHub repositories: %s", str(e))
+
         return {
             "status": "error",
             "message": f"Failed to retrieve GitHub repositories: {str(e)}"
@@ -104,26 +126,29 @@ def _extract_github_username(github_url: str) -> Optional[str]:
     try:
         # Clean up the URL
         url = github_url.strip().rstrip('/')
-
+
         # Handle various GitHub URL formats
         patterns = [
            r'github\.com/([^/]+)/?$', # https://github.com/username
            r'github\.com/([^/]+)/.*', # https://github.com/username/anything
            r'^([a-zA-Z0-9\-_]+)$' # Just username
        ]
-
+
         for pattern in patterns:
             match = re.search(pattern, url)
+
             if match:
                 username = match.group(1)
+
                 # Validate username format
                 if re.match(r'^[a-zA-Z0-9\-_]+$', username) and len(username) <= 39:
                     return username
-
+
         return None
-
-    except Exception as e:
-        logger.warning(
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.warning("Error extracting username from URL %s: %s", github_url, str(e))
+
         return None
 
 
@@ -143,20 +168,23 @@ def _get_github_user_info(username: str) -> Dict:
             "Accept": "application/vnd.github.v3+json",
             "User-Agent": "Resumate-App/1.0"
         }
-
+
         response = requests.get(url, headers=headers, timeout=10)
-
+
         if response.status_code == 404:
             return {"status": "error", "message": f"GitHub user '{username}' not found"}
+
         elif response.status_code == 403:
             return {"status": "error", "message": "GitHub API rate limit exceeded"}
+
         elif response.status_code != 200:
             return {"status": "error", "message": f"GitHub API error: {response.status_code}"}
-
+
         return {"status": "success", "data": response.json()}
-
+
     except requests.RequestException as e:
-        logger.error(
+        logger.error("Network error fetching user info: %s", str(e))
+
         return {"status": "error", "message": f"Network error: {str(e)}"}
 
 
@@ -175,8 +203,9 @@ def _get_user_repositories(username: str) -> Dict:
         all_repos = []
         page = 1
         per_page = 100 # Maximum allowed by GitHub API
-
+
         while True:
+
             url = f"https://api.github.com/users/{username}/repos"
             params = {
                 "type": "public",
@@ -189,32 +218,34 @@ def _get_user_repositories(username: str) -> Dict:
                 "Accept": "application/vnd.github.v3+json",
                 "User-Agent": "Resumate-App/1.0"
             }
-
+
             response = requests.get(url, headers=headers, params=params, timeout=10)
-
+
             if response.status_code != 200:
                 return {"status": "error", "message": f"GitHub API error: {response.status_code}"}
-
+
             repos = response.json()
+
             if not repos: # No more repositories
                 break
-
+
             all_repos.extend(repos)
-
+
             # If we got less than per_page, we've reached the end
             if len(repos) < per_page:
                 break
-
+
             page += 1
-
+
            # Safety limit to prevent infinite loops
            if page > 10: # Max 1000 repos
                break
-
+
         return {"status": "success", "data": all_repos}
-
+
     except requests.RequestException as e:
-        logger.error(
+        logger.error("Network error fetching repositories: %s", str(e))
+
         return {"status": "error", "message": f"Network error: {str(e)}"}
 
 
@@ -229,12 +260,13 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
         List[Dict]: Processed repository data
     """
     processed = []
-
+
     for repo in repos:
+
         # Skip forks unless they have significant modifications
         if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
             continue
-
+
         processed_repo = {
             "name": repo.get("name", ""),
             "description": repo.get("description", ""),
@@ -252,9 +284,9 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
             "has_wiki": repo.get("has_wiki", False),
             "has_pages": repo.get("has_pages", False)
         }
-
+
         processed.append(processed_repo)
-
+
     return processed
 
 
@@ -268,49 +300,51 @@ def format_repositories_for_llm(github_result: Dict) -> str:
     Returns:
         str: Formatted text ready for LLM context
     """
+
     if github_result.get("status") != "success":
-        return
-
+        return "GitHub repositories could not be retrieved: " + \
+            f"{github_result.get('message', 'Unknown error')}"
+
     repositories = github_result.get("repositories", [])
     metadata = github_result.get("metadata", {})
-
+
     if not repositories:
         return f"No public repositories found for {metadata.get('username', 'user')}"
-
+
     formatted_parts = [
         "=== GITHUB REPOSITORIES ===\n",
         f"Profile: {metadata.get('profile_url', 'N/A')}",
         f"Username: {metadata.get('username', 'N/A')}",
         f"Public Repositories: {len(repositories)}\n"
     ]
-
+
     for i, repo in enumerate(repositories[:20], 1): # Limit to top 20 repos
         repo_info = [
             f"[REPOSITORY {i}]",
             f"Name: {repo['name']}",
             f"URL: {repo['html_url']}"
         ]
-
+
         if repo['description']:
             repo_info.append(f"Description: {repo['description']}")
-
+
         if repo['language']:
             repo_info.append(f"Primary Language: {repo['language']}")
-
+
         if repo['topics']:
             repo_info.append(f"Topics: {', '.join(repo['topics'][:5])}") # Limit topics
-
+
         repo_info.extend([
             f"Stars: {repo['stars']} | Forks: {repo['forks']}",
             f"Last Updated: {repo['updated_at'][:10]}", # Just the date
             "" # Empty line between repositories
         ])
-
+
         formatted_parts.extend(repo_info)
-
+
     if len(repositories) > 20:
         formatted_parts.append(f"... and {len(repositories) - 20} more repositories")
-
+
     formatted_parts.append("\n=== END GITHUB REPOSITORIES ===")
-
+
     return '\n'.join(formatted_parts)
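A minimal usage sketch for the updated module (assuming the functions package is on the import path; the profile URL below is only an example). On success, the structured result is also written to data/github_repos.json next to the package:

from functions.github import get_github_repositories, format_repositories_for_llm

# Fetch public repositories for a profile; a copy of the structured result
# is also saved to data/github_repos.json by the function itself.
result = get_github_repositories("https://github.com/octocat")  # example profile URL

if result["status"] == "success":
    # Flatten the structured result into plain text suitable for an LLM prompt.
    print(format_repositories_for_llm(result))
else:
    print(result["message"])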
functions/gradio.py
CHANGED
@@ -31,55 +31,61 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
     result = ""
     extraction_result = None
     logger.info("Processing user inputs from Gradio interface")
-
+
     # Process LinkedIn PDF file
     if linkedin_pdf is not None:
         result += "✅ LinkedIn Resume PDF uploaded\n"
-        logger.info(
-
+        logger.info("Processing LinkedIn PDF: %s", linkedin_pdf.name)
+
         # Extract and structure text from the PDF
         extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
-
+
         if extraction_result["status"] == "success":
             result += " ✅ Text extraction successful\n\n"
             logger.info("LinkedIn PDF text extraction successful")
 
         elif extraction_result["status"] == "warning":
             result += f" ⚠️ Text extraction: {extraction_result['message']}\n\n"
-            logger.warning(
+            logger.warning("LinkedIn PDF extraction warning: %s", extraction_result['message'])
         else:
             result += f" ❌ Text extraction failed: {extraction_result['message']}\n\n"
-            logger.error(
+            logger.error("LinkedIn PDF extraction failed: %s", extraction_result['message'])
     else:
         result += "❌ No LinkedIn resume PDF file uploaded\n\n"
         logger.info("No LinkedIn PDF file provided")
-
+
     # Process GitHub profile
     if github_url and github_url.strip():
         result += "✅ GitHub Profile URL provided\n"
-        logger.info(
-
+        logger.info("Processing GitHub URL: %s", github_url)
+
         # Retrieve repositories from GitHub
         github_result = get_github_repositories(github_url)
-
+
         if github_result["status"] == "success":
             result += " ✅ GitHub list download successful\n\n"
-            logger.info(
-
+            logger.info(
+                "GitHub repositories retrieved successfully for %s",
+                github_result['metadata']['username']
+            )
+
         else:
             result += f" ❌ GitHub extraction failed: {github_result['message']}\n\n"
-            logger.error(
+            logger.error("GitHub extraction failed: %s", github_result['message'])
     else:
         result += "❌ No GitHub profile URL provided\n\n"
         logger.info("No GitHub URL provided")
-
+
     # Process job post text
     if job_post_text and job_post_text.strip():
         result += "✅ Job post text provided\n"
-        logger.info(
+        logger.info("Job post text provided (%d characters)", len(job_post_text))
+
         summary = summarize_job_call(job_post_text)
-        result += " ✅ Job post summary generated\n"
         result += summary
+        result += " ✅ Job post summary generated\n"
+        logger.info("Job post summary generated (%d characters)", len(summary))
+
     else:
         result += "❌ Job post not provided\n"
         logger.info("No job post text provided")
@@ -87,7 +93,7 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
     # Process user instructions
     if user_instructions and user_instructions.strip():
         result += "✅ Additional instructions provided\n"
-        logger.info(
+        logger.info("User instructions provided (%d characters)", len(user_instructions))
     else:
         result += "ℹ️ No additional instructions provided\n"
         logger.info("No additional instructions provided")
@@ -97,21 +103,22 @@ def process_inputs(linkedin_pdf, github_url, job_post_text, user_instructions):
     # Generate resume only if we have valid extraction result
     if extraction_result and extraction_result.get("status") == "success":
         try:
-
-            result +=
+            _ = write_resume(extraction_result, user_instructions)
+            result += "\n✅ Resume generated successfully\n"
             logger.info("Resume generation completed successfully")
-
+
+        except Exception as e: # pylint: disable=broad-exception-caught
             result += f"\n❌ Resume generation failed: {str(e)}\n"
-            logger.error(
+            logger.error("Resume generation failed: %s", str(e))
     else:
-        result +=
+        result += "\n❌ Cannot generate resume: No valid LinkedIn data extracted\n"
         result += "Please ensure you upload a valid LinkedIn PDF export file.\n"
         logger.warning("Resume generation skipped - no valid LinkedIn data available")
-
+
     return result
 
 
-def get_processed_data(linkedin_pdf, github_url, job_post_text,
+def get_processed_data(linkedin_pdf, github_url, job_post_text, instructions):
     """
     Get structured data from all inputs for further processing.
 
@@ -119,33 +126,41 @@ def get_processed_data(linkedin_pdf, github_url, job_post_text, user_instruction
         linkedin_pdf: Uploaded LinkedIn resume export PDF file
         github_url (str): GitHub profile URL
         job_post_text (str): Job post text content
-
-
+        instructions (str): Additional instructions from the user
+
     Returns:
         dict: Structured data containing all processed information
     """
+
+    job_post_text = job_post_text.strip() if job_post_text and job_post_text.strip() else None
+    instructions = instructions.strip() if instructions and instructions.strip() else None
+
     processed_data = {
         "linkedin": None,
         "github": None,
-        "job_post": job_post_text
-        "user_instructions":
+        "job_post": job_post_text,
+        "user_instructions": instructions,
        "errors": []
    }
-
+
     # Process LinkedIn PDF
     if linkedin_pdf is not None:
         extraction_result = extract_text_from_linkedin_pdf(linkedin_pdf.name)
+
         if extraction_result["status"] == "success":
             processed_data["linkedin"] = extraction_result
+
         else:
             processed_data["errors"].append(f"LinkedIn: {extraction_result['message']}")
-
+
     # Process GitHub profile
     if github_url and github_url.strip():
         github_result = get_github_repositories(github_url)
+
         if github_result["status"] == "success":
             processed_data["github"] = github_result
+
         else:
             processed_data["errors"].append(f"GitHub: {github_result['message']}")
-
+
     return processed_data
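A rough sketch of calling the revised get_processed_data signature from other code. The SimpleNamespace stand-in for the Gradio file object and all argument values below are assumptions for illustration only; in the app, Gradio supplies the upload object itself:

from types import SimpleNamespace

from functions.gradio import get_processed_data

# Gradio passes an object whose .name attribute holds the uploaded file's path;
# SimpleNamespace mimics that shape here for the sake of the example.
pdf_stub = SimpleNamespace(name="data/linkedin_export.pdf")  # placeholder path

data = get_processed_data(
    pdf_stub,
    "https://github.com/octocat",            # example profile URL
    "Senior Python developer, remote role",  # illustrative job post text
    "Keep the resume to one page",           # extra user instructions
)
print(data["errors"])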
functions/job_call.py
CHANGED
@@ -1,9 +1,15 @@
 '''Functions for summarizing and formatting job calls.'''
 
 import os
+import logging
 from openai import OpenAI
 from configuration import JOB_CALL_EXTRACTION_PROMPT
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
 def summarize_job_call(job_call: str) -> str:
     '''Extracts and summarizes key information from job call.'''
 
@@ -51,4 +57,4 @@ def summarize_job_call(job_call: str) -> str:
     else:
         summary = None
 
-    return summary
+    return summary
functions/linkedin_resume.py
CHANGED
@@ -9,6 +9,8 @@ import re
 import logging
 import io
 import os
+import json
+from pathlib import Path
 import PyPDF2
 
 # Set up logging
@@ -41,35 +43,37 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
     """
     if pdf_file is None:
         return {"status": "error", "message": "No PDF file provided"}
-
+
     try:
         # Get filename from path
         filename = os.path.basename(pdf_file)
-
+
         # Read the PDF file from the file path
         with open(pdf_file, 'rb') as file:
             file_content = file.read()
             file_size = len(file_content)
-
+
         # Create PDF reader from the file content
         pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
-
+
         # Extract text from all pages
         extracted_text = ""
         num_pages = len(pdf_reader.pages)
-
+
         for page_num in range(num_pages):
             try:
                 page = pdf_reader.pages[page_num]
                 page_text = page.extract_text()
                 extracted_text += page_text + "\n\n"
-
-
+
+            except Exception as e: # pylint: disable=broad-exception-caught
+                logger.warning("Error extracting text from page %d: %s", page_num + 1, str(e))
+
                 continue
-
+
         # Clean and structure the extracted text for LLM consumption
         structured_content = _structure_resume_text(extracted_text)
-
+
         if not structured_content["full_text"].strip():
             return {
                 "status": "warning",
@@ -81,10 +85,14 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
                 },
                 "message": "PDF processed but no text content was extracted"
             }
-
-        logger.info(
-
-
+
+        logger.info(
+            "Successfully extracted and structured %d characters from %s",
+            len(structured_content['full_text']),
+            filename
+        )
+
+        result = {
             "status": "success",
             "structured_text": structured_content,
             "metadata": {
@@ -95,9 +103,26 @@ def extract_text_from_linkedin_pdf(pdf_file) -> dict:
             },
             "message": f"Text extracted and structured successfully from {num_pages} pages"
         }
-
-
-
+
+        # Save results to JSON file
+        try:
+            data_dir = Path(__file__).parent.parent / "data"
+            data_dir.mkdir(exist_ok=True)
+
+            output_file = data_dir / "linkedin_resume.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            logger.info("LinkedIn resume extraction saved to %s", output_file)
+
+        except Exception as save_error: # pylint: disable=broad-exception-caught
+            logger.warning("Failed to save LinkedIn resume extraction to file: %s", str(save_error))
+
+        return result
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error("Error processing PDF file: %s", str(e))
+
         return {
             "status": "error",
             "message": f"Failed to extract text from PDF: {str(e)}"
@@ -124,10 +149,10 @@ def _structure_resume_text(text: str) -> dict:
         "word_count": 0,
         "section_count": 0
     }
-
+
     # Clean the text first
     cleaned_text = _clean_extracted_text(text)
-
+
     # Define section patterns (common LinkedIn export sections)
     section_patterns = {
         "contact_info": r"(?i)(contact|personal|profile)\s*(?:information)?",
@@ -137,56 +162,66 @@ def _structure_resume_text(text: str) -> dict:
         "skills": r"(?i)(skills|competencies|technologies|technical)",
         "certifications": r"(?i)(certification|certificate|license)",
     }
-
+
     # Split text into lines for processing
     lines = cleaned_text.split('\n')
     sections = {}
     current_section = "general"
     current_content = []
-
+
     for line in lines:
         line = line.strip()
+
         if not line:
             continue
-
+
         # Check if line is a section header
         section_found = None
+
         for section_name, pattern in section_patterns.items():
             if re.match(pattern, line):
+
                 section_found = section_name
                 break
-
+
         if section_found:
+
             # Save previous section content
             if current_content:
                 sections[current_section] = '\n'.join(current_content)
-
+
             # Start new section
             current_section = section_found
             current_content = [line]
+
         else:
             current_content.append(line)
-
+
     # Save the last section
     if current_content:
         sections[current_section] = '\n'.join(current_content)
-
+
     # Create a structured summary for LLM context
     summary_parts = []
+
     if "contact_info" in sections:
         summary_parts.append(f"CONTACT: {sections['contact_info'][:200]}...")
+
     if "summary" in sections:
         summary_parts.append(f"SUMMARY: {sections['summary']}")
+
     if "experience" in sections:
         summary_parts.append(f"EXPERIENCE: {sections['experience'][:300]}...")
+
     if "education" in sections:
         summary_parts.append(f"EDUCATION: {sections['education']}")
+
     if "skills" in sections:
         summary_parts.append(f"SKILLS: {sections['skills']}")
-
+
     # Create LLM-optimized format
-    llm_formatted_text = _format_for_llm(sections
-
+    llm_formatted_text = _format_for_llm(sections)
+
     return {
         "sections": sections,
         "full_text": cleaned_text,
@@ -198,7 +233,7 @@ def _structure_resume_text(text: str) -> dict:
     }
 
 
-def _format_for_llm(sections: dict
+def _format_for_llm(sections: dict) -> str:
     """
     Format the resume sections in an optimal way for LLM processing.
 
@@ -210,32 +245,35 @@ def _format_for_llm(sections: dict, full_text: str) -> str:
         str: LLM-optimized formatted text
     """
     formatted_parts = ["=== RESUME CONTENT ===\n"]
-
+
     # Prioritize sections in logical order for LLM
-    priority_order = ["summary", "contact_info", "experience", "education", "skills",
+    priority_order = ["summary", "contact_info", "experience", "education", "skills",
                       "certifications", "projects", "achievements", "languages", "volunteer"]
-
+
     # Add prioritized sections
     for section_name in priority_order:
         if section_name in sections:
+
             formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
             formatted_parts.append(sections[section_name])
             formatted_parts.append("") # Empty line between sections
-
+
     # Add any remaining sections
     for section_name, content in sections.items():
         if section_name not in priority_order and section_name != "general":
+
             formatted_parts.append(f"[{section_name.upper().replace('_', ' ')}]")
             formatted_parts.append(content)
             formatted_parts.append("")
-
+
     # Add general content if exists
     if "general" in sections:
+
         formatted_parts.append("[ADDITIONAL INFORMATION]")
         formatted_parts.append(sections["general"])
-
+
     formatted_parts.append("\n=== END RESUME ===")
-
+
     return '\n'.join(formatted_parts)
 
 
@@ -251,39 +289,40 @@ def _clean_extracted_text(text: str) -> str:
     """
     if not text:
         return ""
-
+
     # Remove excessive whitespace and normalize line endings
     text = re.sub(r'\r\n', '\n', text)
     text = re.sub(r'\r', '\n', text)
-
+
     # Split into lines and clean each line
     lines = text.split('\n')
     cleaned_lines = []
-
+
     for line in lines:
+
         # Strip whitespace
         cleaned_line = line.strip()
-
+
         # Skip empty lines and very short lines (likely artifacts)
         if len(cleaned_line) < 2:
             continue
-
+
         # Remove common PDF artifacts
         cleaned_line = re.sub(r'^\d+$', '', cleaned_line) # Page numbers
         cleaned_line = re.sub(r'^[|\-_=]+$', '', cleaned_line) # Separator lines
-
+
         if cleaned_line:
             cleaned_lines.append(cleaned_line)
-
+
     # Join lines and normalize spacing
     cleaned_text = '\n'.join(cleaned_lines)
-
+
     # Normalize multiple spaces to single spaces
     cleaned_text = re.sub(r' +', ' ', cleaned_text)
-
+
     # Normalize multiple newlines to maximum of 2
     cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
-
+
     return cleaned_text.strip()
 
 
@@ -299,8 +338,8 @@ def get_llm_context_from_resume(extraction_result: dict) -> str:
     """
     if extraction_result.get("status") != "success":
         return ""
-
+
     structured_text = extraction_result.get("structured_text", {})
-
+
     # Return the LLM-formatted version if available, otherwise fall back to full text
     return structured_text.get("llm_formatted", structured_text.get("full_text", ""))
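As with the GitHub helper, the extraction result is now mirrored to data/linkedin_resume.json. A minimal sketch of the round trip (the PDF path is a placeholder and the functions package is assumed importable):

from functions.linkedin_resume import (
    extract_text_from_linkedin_pdf,
    get_llm_context_from_resume,
)

# Parse a LinkedIn profile export; on success the structured result is also
# saved to data/linkedin_resume.json by the function itself.
extraction = extract_text_from_linkedin_pdf("data/Profile.pdf")  # placeholder path

if extraction["status"] == "success":
    # Section-ordered plain text ready to drop into an LLM prompt.
    print(get_llm_context_from_resume(extraction)[:500])
else:
    print(extraction["message"])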
functions/writer_agent.py
CHANGED
@@ -23,7 +23,7 @@ def write_resume(content: str, user_instructions: str = None) -> str:
     """
 
     if content['status'] == 'success':
-
+
         agent = CodeAgent(
             model=AGENT_MODEL,
             tools=[],
@@ -36,7 +36,9 @@ def write_resume(content: str, user_instructions: str = None) -> str:
 
         # Prepare instructions - combine default with user instructions
         instructions = INSTRUCTIONS
+
         if user_instructions and user_instructions.strip():
+
             instructions += f"\n\nAdditional user instructions:\n{user_instructions.strip()}"
             logger.info("Added user instructions to agent prompt")
 
@@ -44,21 +46,26 @@ def write_resume(content: str, user_instructions: str = None) -> str:
             instructions + '\n' + json.dumps(content['structured_text']),
         )
 
-        logger.info("submitted_answer: %s", submitted_answer)
-
+        logger.info("submitted_answer: %s", submitted_answer)
+
         # Create data directory if it doesn't exist
         data_dir = 'data'
+
         if not os.path.exists(data_dir):
+
             os.makedirs(data_dir)
             logger.info("Created data directory: %s", data_dir)
-
+
         # Save the resume to resume.md in the data directory
         resume_file_path = os.path.join(data_dir, 'resume.md')
+
         try:
             with open(resume_file_path, 'w', encoding='utf-8') as f:
                 f.write(submitted_answer)
+
             logger.info("Resume saved to: %s", resume_file_path)
-
+
+        except Exception as e: # pylint: disable=broad-exception-caught
             logger.error("Failed to save resume to file: %s", e)
-
-        return submitted_answer
+
+        return submitted_answer
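write_resume still returns the generated text and now also leaves a copy in data/resume.md. A hedged end-to-end sketch (paths and the instruction string are illustrative, and the functions package is assumed importable):

from functions.linkedin_resume import extract_text_from_linkedin_pdf
from functions.writer_agent import write_resume

extraction = extract_text_from_linkedin_pdf("data/Profile.pdf")  # placeholder path

if extraction["status"] == "success":
    # The agent's markdown output is returned and also written to data/resume.md.
    resume_md = write_resume(extraction, "Emphasize machine learning projects")
    print(resume_md[:300])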