gperdrizet committed on
Commit
1084ca5
·
1 Parent(s): cbb592a

Updated tests for new PDF parsing function.

Browse files
functions/context_acquisition.py CHANGED
@@ -136,10 +136,6 @@ def _structure_resume_text(text: str) -> dict:
136
  "education": r"(?i)(education|academic|university|college|school)",
137
  "skills": r"(?i)(skills|competencies|technologies|technical)",
138
  "certifications": r"(?i)(certification|certificate|license)",
139
- "projects": r"(?i)(project|portfolio)",
140
- "achievements": r"(?i)(achievement|award|honor|recognition)",
141
- "languages": r"(?i)(language|linguistic)",
142
- "volunteer": r"(?i)(volunteer|community|charity)"
143
  }
144
 
145
  # Split text into lines for processing
 
136
  "education": r"(?i)(education|academic|university|college|school)",
137
  "skills": r"(?i)(skills|competencies|technologies|technical)",
138
  "certifications": r"(?i)(certification|certificate|license)",
 
 
 
 
139
  }
140
 
141
  # Split text into lines for processing
tests/test_context_acquisition.py CHANGED
@@ -1,3 +1,196 @@
1
  """
2
  Unit tests for the context_acquisition module.
3
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Unit tests for the context_acquisition module.
3
  """
4
+
5
+ import unittest
6
+ import tempfile
7
+ import os
8
+ from unittest.mock import patch, MagicMock
9
+ from functions import context_acquisition as ca
10
+
11
+
12
+ class TestCleanExtractedText(unittest.TestCase):
13
+ """Test cases for the _clean_extracted_text function."""
14
+
15
+ def test_normalize_multiple_newlines(self):
16
+ """Test normalization of multiple newlines."""
17
+ raw = "Line 1\n\nLine 2\n\n\nLine 3"
18
+ expected = "Line 1\nLine 2\nLine 3"
19
+ self.assertEqual(ca._clean_extracted_text(raw), expected)
20
+
21
+ def test_remove_artifacts(self):
22
+ """Test removal of PDF artifacts."""
23
+ raw = " 123 \n|---|\nSome text\n"
24
+ expected = "Some text"
25
+ self.assertEqual(ca._clean_extracted_text(raw), expected)
26
+
27
+ def test_normalize_spaces(self):
28
+ """Test normalization of multiple spaces."""
29
+ raw = "A  B   C"
30
+ expected = "A B C"
31
+ self.assertEqual(ca._clean_extracted_text(raw), expected)
32
+
33
+ def test_empty_string(self):
34
+ """Test handling of empty string."""
35
+ self.assertEqual(ca._clean_extracted_text(""), "")
36
+
37
+ def test_none_input(self):
38
+ """Test handling of None input."""
39
+ self.assertEqual(ca._clean_extracted_text(None), "")
40
+
41
+
42
+ class TestStructureResumeText(unittest.TestCase):
43
+ """Test cases for the _structure_resume_text function."""
44
+
45
+ def test_basic_structure(self):
46
+ """Test basic resume text structuring."""
47
+ text = "Contact Info\nJohn Doe\nSummary\nExperienced dev\nExperience\nCompany X\nEducation\nMIT\nSkills\nPython, C++"
48
+ result = ca._structure_resume_text(text)
49
+
50
+ self.assertIn("contact_info", result["sections"])
51
+ self.assertIn("summary", result["sections"])
52
+ self.assertIn("experience", result["sections"])
53
+ self.assertIn("education", result["sections"])
54
+ self.assertIn("skills", result["sections"])
55
+ self.assertGreater(result["word_count"], 0)
56
+ self.assertGreaterEqual(result["section_count"], 5)
57
+
58
+ def test_empty_text(self):
59
+ """Test handling of empty text."""
60
+ result = ca._structure_resume_text("")
61
+ self.assertEqual(result["sections"], {})
62
+ self.assertEqual(result["full_text"], "")
63
+ self.assertEqual(result["word_count"], 0)
64
+ self.assertEqual(result["section_count"], 0)
65
+
66
+ def test_contains_required_fields(self):
67
+ """Test that result contains all required fields."""
68
+ text = "Some basic text"
69
+ result = ca._structure_resume_text(text)
70
+
71
+ required_fields = ["sections", "full_text", "llm_formatted", "summary",
72
+ "format", "word_count", "section_count"]
73
+ for field in required_fields:
74
+ self.assertIn(field, result)
75
+
76
+
77
+ class TestFormatForLLM(unittest.TestCase):
78
+ """Test cases for the _format_for_llm function."""
79
+
80
+ def test_section_formatting(self):
81
+ """Test proper formatting of sections for LLM."""
82
+ sections = {
83
+ "summary": "A summary.",
84
+ "contact_info": "Contact details.",
85
+ "experience": "Work exp.",
86
+ "education": "School info.",
87
+ "skills": "Python, C++"
88
+ }
89
+ full_text = "..."
90
+ formatted = ca._format_for_llm(sections, full_text)
91
+
92
+ self.assertIn("[SUMMARY]", formatted)
93
+ self.assertIn("[CONTACT INFO]", formatted)
94
+ self.assertIn("[EXPERIENCE]", formatted)
95
+ self.assertIn("[EDUCATION]", formatted)
96
+ self.assertIn("[SKILLS]", formatted)
97
+ self.assertTrue(formatted.startswith("=== RESUME CONTENT ==="))
98
+ self.assertTrue(formatted.endswith("=== END RESUME ==="))
99
+
100
+ def test_empty_sections(self):
101
+ """Test handling of empty sections."""
102
+ sections = {}
103
+ full_text = "test"
104
+ formatted = ca._format_for_llm(sections, full_text)
105
+
106
+ self.assertTrue(formatted.startswith("=== RESUME CONTENT ==="))
107
+ self.assertTrue(formatted.endswith("=== END RESUME ==="))
108
+
109
+
110
+ class TestGetLLMContextFromResume(unittest.TestCase):
111
+ """Test cases for the get_llm_context_from_resume function."""
112
+
113
+ def test_success_with_llm_formatted(self):
114
+ """Test successful extraction with LLM formatted text."""
115
+ extraction_result = {
116
+ "status": "success",
117
+ "structured_text": {"llm_formatted": "LLM text", "full_text": "Full text"}
118
+ }
119
+ result = ca.get_llm_context_from_resume(extraction_result)
120
+ self.assertEqual(result, "LLM text")
121
+
122
+ def test_fallback_to_full_text(self):
123
+ """Test fallback to full text when LLM formatted not available."""
124
+ extraction_result = {
125
+ "status": "success",
126
+ "structured_text": {"full_text": "Full text"}
127
+ }
128
+ result = ca.get_llm_context_from_resume(extraction_result)
129
+ self.assertEqual(result, "Full text")
130
+
131
+ def test_error_status(self):
132
+ """Test handling of error status."""
133
+ extraction_result = {"status": "error"}
134
+ result = ca.get_llm_context_from_resume(extraction_result)
135
+ self.assertEqual(result, "")
136
+
137
+ def test_missing_structured_text(self):
138
+ """Test handling of missing structured_text."""
139
+ extraction_result = {"status": "success"}
140
+ result = ca.get_llm_context_from_resume(extraction_result)
141
+ self.assertEqual(result, "")
142
+
143
+
144
+ class TestExtractTextFromLinkedInPDF(unittest.TestCase):
145
+ """Test cases for the extract_text_from_linkedin_pdf function."""
146
+
147
+ def test_none_input(self):
148
+ """Test handling of None input."""
149
+ result = ca.extract_text_from_linkedin_pdf(None)
150
+ self.assertEqual(result["status"], "error")
151
+ self.assertIn("No PDF file provided", result["message"])
152
+
153
+ @patch('PyPDF2.PdfReader')
154
+ @patch('builtins.open')
155
+ def test_successful_extraction(self, mock_open, mock_pdf_reader):
156
+ """Test successful PDF text extraction with mocked PyPDF2."""
157
+ # Create a temporary file
158
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
159
+ tmp_path = tmp.name
160
+
161
+ try:
162
+ # Mock file reading
163
+ mock_file = MagicMock()
164
+ mock_file.read.return_value = b"fake pdf content"
165
+ mock_open.return_value.__enter__.return_value = mock_file
166
+
167
+ # Mock PDF reader and page
168
+ mock_page = MagicMock()
169
+ mock_page.extract_text.return_value = "Contact Info\nJohn Doe\nSummary\nDeveloper\nExperience\nCompany X"
170
+
171
+ mock_reader_instance = MagicMock()
172
+ mock_reader_instance.pages = [mock_page]
173
+ mock_pdf_reader.return_value = mock_reader_instance
174
+
175
+ # Test the function
176
+ result = ca.extract_text_from_linkedin_pdf(tmp_path)
177
+
178
+ self.assertEqual(result["status"], "success")
179
+ self.assertIn("structured_text", result)
180
+ self.assertIn("metadata", result)
181
+ self.assertIn("contact_info", result["structured_text"]["sections"])
182
+
183
+ finally:
184
+ # Clean up
185
+ if os.path.exists(tmp_path):
186
+ os.remove(tmp_path)
187
+
188
+ def test_nonexistent_file(self):
189
+ """Test handling of non-existent file."""
190
+ result = ca.extract_text_from_linkedin_pdf("/nonexistent/path.pdf")
191
+ self.assertEqual(result["status"], "error")
192
+ self.assertIn("Failed to extract text from PDF", result["message"])
193
+
194
+
195
+ if __name__ == '__main__':
196
+ unittest.main()