Spaces:
Configuration error
Configuration error
Commit
·
1084ca5
1
Parent(s):
cbb592a
Updated tests for new PDF parsing function.
Browse files
functions/context_acquisition.py
CHANGED
@@ -136,10 +136,6 @@ def _structure_resume_text(text: str) -> dict:
|
|
136 |
"education": r"(?i)(education|academic|university|college|school)",
|
137 |
"skills": r"(?i)(skills|competencies|technologies|technical)",
|
138 |
"certifications": r"(?i)(certification|certificate|license)",
|
139 |
-
"projects": r"(?i)(project|portfolio)",
|
140 |
-
"achievements": r"(?i)(achievement|award|honor|recognition)",
|
141 |
-
"languages": r"(?i)(language|linguistic)",
|
142 |
-
"volunteer": r"(?i)(volunteer|community|charity)"
|
143 |
}
|
144 |
|
145 |
# Split text into lines for processing
|
|
|
136 |
"education": r"(?i)(education|academic|university|college|school)",
|
137 |
"skills": r"(?i)(skills|competencies|technologies|technical)",
|
138 |
"certifications": r"(?i)(certification|certificate|license)",
|
|
|
|
|
|
|
|
|
139 |
}
|
140 |
|
141 |
# Split text into lines for processing
|
tests/test_context_acquisition.py
CHANGED
@@ -1,3 +1,196 @@
|
|
1 |
"""
|
2 |
Unit tests for the context_acquisition module.
|
3 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""
|
2 |
Unit tests for the context_acquisition module.
|
3 |
"""
|
4 |
+
|
5 |
+
import unittest
|
6 |
+
import tempfile
|
7 |
+
import os
|
8 |
+
from unittest.mock import patch, MagicMock
|
9 |
+
from functions import context_acquisition as ca
|
10 |
+
|
11 |
+
|
12 |
+
class TestCleanExtractedText(unittest.TestCase):
    """Test cases for the _clean_extracted_text function."""

    def test_normalize_multiple_newlines(self):
        """Consecutive newlines in the input are expected to collapse to one."""
        raw = "Line 1\n\nLine 2\n\n\nLine 3"
        expected = "Line 1\nLine 2\nLine 3"
        self.assertEqual(ca._clean_extracted_text(raw), expected)

    def test_remove_artifacts(self):
        """A digits-only line and a table-rule line are expected to be dropped."""
        raw = " 123 \n|---|\nSome text\n"
        expected = "Some text"
        self.assertEqual(ca._clean_extracted_text(raw), expected)

    def test_normalize_spaces(self):
        """Runs of spaces are expected to collapse to a single space."""
        # The input must contain runs of more than one space; with a
        # single-spaced input the assertion holds trivially and the test
        # cannot detect a broken normalization step.
        raw = "A    B   C"
        expected = "A B C"
        self.assertEqual(ca._clean_extracted_text(raw), expected)

    def test_empty_string(self):
        """An empty string is expected to come back unchanged."""
        self.assertEqual(ca._clean_extracted_text(""), "")

    def test_none_input(self):
        """None is expected to be turned into an empty string."""
        self.assertEqual(ca._clean_extracted_text(None), "")
|
40 |
+
|
41 |
+
|
42 |
+
class TestStructureResumeText(unittest.TestCase):
    """Test cases for the _structure_resume_text function."""

    def test_basic_structure(self):
        """A resume with five headed sections yields all five section keys."""
        resume_lines = [
            "Contact Info",
            "John Doe",
            "Summary",
            "Experienced dev",
            "Experience",
            "Company X",
            "Education",
            "MIT",
            "Skills",
            "Python, C++",
        ]
        structured = ca._structure_resume_text("\n".join(resume_lines))

        expected_sections = (
            "contact_info",
            "summary",
            "experience",
            "education",
            "skills",
        )
        for section_name in expected_sections:
            self.assertIn(section_name, structured["sections"])
        self.assertGreater(structured["word_count"], 0)
        self.assertGreaterEqual(structured["section_count"], 5)

    def test_empty_text(self):
        """Empty input yields empty sections, empty text and zero counts."""
        structured = ca._structure_resume_text("")
        self.assertEqual(structured["sections"], {})
        self.assertEqual(structured["full_text"], "")
        self.assertEqual(structured["word_count"], 0)
        self.assertEqual(structured["section_count"], 0)

    def test_contains_required_fields(self):
        """The result dict exposes every key that callers rely on."""
        structured = ca._structure_resume_text("Some basic text")

        for key in (
            "sections",
            "full_text",
            "llm_formatted",
            "summary",
            "format",
            "word_count",
            "section_count",
        ):
            self.assertIn(key, structured)
|
75 |
+
|
76 |
+
|
77 |
+
class TestFormatForLLM(unittest.TestCase):
    """Test cases for the _format_for_llm function."""

    # Delimiters the formatted output is expected to start and end with.
    _HEADER = "=== RESUME CONTENT ==="
    _FOOTER = "=== END RESUME ==="

    def test_section_formatting(self):
        """Each section appears under a bracketed upper-case heading."""
        resume_sections = {
            "summary": "A summary.",
            "contact_info": "Contact details.",
            "experience": "Work exp.",
            "education": "School info.",
            "skills": "Python, C++",
        }
        output = ca._format_for_llm(resume_sections, "...")

        for heading in (
            "[SUMMARY]",
            "[CONTACT INFO]",
            "[EXPERIENCE]",
            "[EDUCATION]",
            "[SKILLS]",
        ):
            self.assertIn(heading, output)
        self.assertTrue(output.startswith(self._HEADER))
        self.assertTrue(output.endswith(self._FOOTER))

    def test_empty_sections(self):
        """With no sections the output is still wrapped in the delimiters."""
        output = ca._format_for_llm({}, "test")

        self.assertTrue(output.startswith(self._HEADER))
        self.assertTrue(output.endswith(self._FOOTER))
|
108 |
+
|
109 |
+
|
110 |
+
class TestGetLLMContextFromResume(unittest.TestCase):
    """Test cases for the get_llm_context_from_resume function."""

    def test_success_with_llm_formatted(self):
        """The LLM-formatted text is returned when it is present."""
        structured = {"llm_formatted": "LLM text", "full_text": "Full text"}
        payload = {"status": "success", "structured_text": structured}
        self.assertEqual(ca.get_llm_context_from_resume(payload), "LLM text")

    def test_fallback_to_full_text(self):
        """The full text is returned when no LLM-formatted text exists."""
        payload = {
            "status": "success",
            "structured_text": {"full_text": "Full text"},
        }
        self.assertEqual(ca.get_llm_context_from_resume(payload), "Full text")

    def test_error_status(self):
        """An error status yields an empty string."""
        payload = {"status": "error"}
        self.assertEqual(ca.get_llm_context_from_resume(payload), "")

    def test_missing_structured_text(self):
        """A success status without structured_text yields an empty string."""
        payload = {"status": "success"}
        self.assertEqual(ca.get_llm_context_from_resume(payload), "")
|
142 |
+
|
143 |
+
|
144 |
+
class TestExtractTextFromLinkedInPDF(unittest.TestCase):
    """Test cases for the extract_text_from_linkedin_pdf function."""

    def test_none_input(self):
        """Passing None reports an error mentioning the missing PDF file."""
        outcome = ca.extract_text_from_linkedin_pdf(None)
        self.assertEqual(outcome["status"], "error")
        self.assertIn("No PDF file provided", outcome["message"])

    # Stacked patch decorators inject their mocks bottom-up, so the
    # builtins.open mock is the first extra argument.
    @patch('PyPDF2.PdfReader')
    @patch('builtins.open')
    def test_successful_extraction(self, open_mock, reader_cls_mock):
        """A mocked one-page PDF is extracted into structured sections."""
        # A real (empty) file on disk gives the function an existing path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as handle:
            pdf_path = handle.name

        try:
            # Stand-in for the file object yielded by `with open(...)`.
            fake_file = MagicMock()
            fake_file.read.return_value = b"fake pdf content"
            open_mock.return_value.__enter__.return_value = fake_file

            # Single page whose extracted text carries three section headings.
            page_text = (
                "Contact Info\nJohn Doe\nSummary\nDeveloper\n"
                "Experience\nCompany X"
            )
            fake_page = MagicMock(**{"extract_text.return_value": page_text})
            reader_cls_mock.return_value = MagicMock(pages=[fake_page])

            outcome = ca.extract_text_from_linkedin_pdf(pdf_path)

            self.assertEqual(outcome["status"], "success")
            self.assertIn("structured_text", outcome)
            self.assertIn("metadata", outcome)
            self.assertIn(
                "contact_info", outcome["structured_text"]["sections"]
            )
        finally:
            # Remove the temporary file even when an assertion fails.
            if os.path.exists(pdf_path):
                os.remove(pdf_path)

    def test_nonexistent_file(self):
        """A path that does not exist reports an extraction failure."""
        outcome = ca.extract_text_from_linkedin_pdf("/nonexistent/path.pdf")
        self.assertEqual(outcome["status"], "error")
        self.assertIn("Failed to extract text from PDF", outcome["message"])
|
193 |
+
|
194 |
+
|
195 |
+
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|