ruslanmv committed on
Commit
4813a0f
·
1 Parent(s): a74397e

First commit

Browse files
Files changed (2) hide show
  1. app.py +161 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import PyPDF2
4
+ import docx2txt
5
+ import logging
6
+
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ from nltk.tokenize import word_tokenize
10
+
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
# Root logger setup: emit everything from DEBUG upwards, each record prefixed
# with its timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
16
+
17
+ # ----------------------------------------------------------------------------
18
+ # 1) Utility Functions: Parsing & Preprocessing
19
+ # ----------------------------------------------------------------------------
20
+
21
def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object.

    Args:
        file_obj: Source handed straight to ``PyPDF2.PdfReader`` (the caller
            in this file passes a file opened in binary mode).

    Returns:
        str: The text of every page that produced any, joined with newlines.
        This function never raises: if anything goes wrong, the string
        ``"Error reading PDF: <exception>"`` is returned instead.
    """
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        # Pages for which extract_text() gives nothing (None or "") are skipped.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        extracted_text = "\n".join(text for text in page_texts if text)
        # Only a short preview is logged; the full document is deliberately
        # NOT printed, since resumes contain personal data.
        logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        # Best-effort contract: report the failure as text, but keep the
        # traceback in the log so the root cause stays diagnosable.
        logging.exception(f"Error reading PDF: {e}")
        return f"Error reading PDF: {e}"
40
+
41
def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk.

    Args:
        file_path: Path of the ``.docx`` file, passed to ``docx2txt.process``.

    Returns:
        str: The extracted text. This function never raises: if anything goes
        wrong, the string ``"Error reading DOCX: <exception>"`` is returned
        instead.
    """
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        # Only a short preview is logged; the full document is deliberately
        # NOT printed, since resumes contain personal data.
        logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        # Best-effort contract: report the failure as text, but keep the
        # traceback in the log so the root cause stays diagnosable.
        logging.exception(f"Error reading DOCX: {e}")
        return f"Error reading DOCX: {e}"
54
+
55
def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object.

    Args:
        file_obj: Object with a ``read()`` method. Binary streams are decoded
            as UTF-8 (undecodable bytes are dropped); text streams are used
            as-is.

    Returns:
        str: The file's text with any leading byte-order mark removed. This
        function never raises: if anything goes wrong, the string
        ``"Error reading TXT: <exception>"`` is returned instead.
    """
    try:
        logging.info("Loading TXT file.")
        raw = file_obj.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" strips a leading BOM; otherwise the BOM would stick
            # to the first word and make it fail later alphabetic filtering.
            extracted_text = raw.decode("utf-8-sig", errors="ignore")
        else:
            # Stream was opened in text mode: already decoded.
            extracted_text = str(raw).lstrip("\ufeff")
        # Only a short preview is logged; the full document is deliberately
        # NOT printed, since resumes contain personal data.
        logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        # Best-effort contract: report the failure as text, but keep the
        # traceback in the log so the root cause stays diagnosable.
        logging.exception(f"Error reading TXT: {e}")
        return f"Error reading TXT: {e}"
68
+
69
_STOP_WORDS = None  # Lazily filled cache of NLTK's English stop-word list.


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize free text into a space-separated string of content words.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic or that are English stop words are dropped,
    and the survivors are re-joined with single spaces.

    Args:
        text: Text to clean. Non-string values are converted with ``str``;
            ``None`` is treated as empty text rather than the word "none".

    Returns:
        str: The cleaned text; ``""`` when nothing survives filtering.
    """
    logging.info("Preprocessing text.")
    text = "" if text is None else str(text).lower()
    if not text.strip():
        # Nothing to tokenize; the result is the same empty string the full
        # pipeline would produce.
        processed_text = ""
    else:
        stop_words = _english_stop_words()
        processed_text = " ".join(
            token
            for token in word_tokenize(text)
            if token.isalpha() and token not in stop_words
        )
    logging.info(f"Preprocessed text: {processed_text[:500]}...")
    return processed_text
82
+
83
+ # ----------------------------------------------------------------------------
84
+ # 2) Core Ranking Logic with TF-IDF & Cosine Similarity
85
+ # ----------------------------------------------------------------------------
86
+
87
def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes by TF-IDF cosine similarity to a job description.

    The job description and every resume are cleaned with ``preprocess_text``,
    vectorized together with a single ``TfidfVectorizer``, and each resume is
    scored by its cosine similarity to the job description.

    Args:
        job_description: Raw text of the job description.
        resumes: Mapping of resume name -> raw resume text.

    Returns:
        list: ``(name, score)`` tuples sorted by score, highest first. Empty
        when ``resumes`` is empty. Every score is ``0.0`` when the vectorizer
        cannot build a vocabulary from the cleaned texts.
    """
    logging.info("Ranking resumes using TF-IDF.")
    if not resumes:
        # Without at least one resume there is no matrix to compare against;
        # scikit-learn would raise on the empty slice.
        logging.info("No resumes supplied; nothing to rank.")
        return []

    filenames = list(resumes)
    corpus = [preprocess_text(job_description)]
    corpus.extend(preprocess_text(resumes[name]) for name in filenames)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(corpus)
    except ValueError as e:
        # Raised by the vectorizer when no document contributes a usable term
        # (e.g. everything was stop words or non-alphabetic). Nothing can be
        # compared, so every resume scores zero.
        logging.warning(f"TF-IDF vectorization failed ({e}); scoring all resumes 0.0.")
        results = [(name, 0.0) for name in filenames]
    else:
        # Row 0 is the job description; the remaining rows follow `filenames`.
        similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
        results = list(zip(filenames, similarities))

    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
    logging.info(f"Ranking completed: {results_sorted}")
    return results_sorted
102
+
103
+ # ----------------------------------------------------------------------------
104
+ # 3) Gradio Callback Function
105
+ # ----------------------------------------------------------------------------
106
+
107
def analyze_cvs(job_description, cv_files):
    """Gradio callback: extract text from uploaded CVs and rank them.

    Args:
        job_description: Job description text from the UI.
        cv_files: Uploaded files. Each entry may be a path string or an object
            exposing its path as ``.name``; ``None`` or an empty list means
            nothing was uploaded.

    Returns:
        list: Rows of ``[display_name, score]`` with the score rounded to
        three decimals, in the order produced by ``rank_resumes_with_tfidf``.
        Empty when no files were provided.
    """
    logging.info("Starting CV analysis.")
    if not cv_files:
        # Nothing uploaded: there is nothing to rank, and iterating None
        # would raise.
        logging.info("No CV files provided; nothing to analyze.")
        return []
    if isinstance(cv_files, str):
        # A lone path must not be iterated character by character.
        cv_files = [cv_files]

    resumes_data = {}
    for uploaded_file in cv_files:
        # Accept both plain path strings and objects carrying the path in .name.
        file_path = getattr(uploaded_file, "name", uploaded_file)
        filename = os.path.basename(file_path)
        file_ext = os.path.splitext(filename)[1].lower()

        try:
            logging.info(f"Processing file: {filename}")
            if file_ext == ".pdf":
                with open(file_path, "rb") as f:
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(file_path, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                # docx2txt works from the path directly.
                file_content = extract_text_from_docx(file_path)
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            file_content = f"Error processing file: {e}"

        # Two uploads can share a basename; give later ones a numbered suffix
        # so they do not overwrite earlier entries.
        display_name = filename
        duplicate_index = 2
        while display_name in resumes_data:
            display_name = f"{filename} ({duplicate_index})"
            duplicate_index += 1

        logging.info(f"Extracted CV Content ({display_name}): {file_content[:500]}...")
        resumes_data[display_name] = file_content

    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[name, round(float(score), 3)] for name, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data
141
+
142
+ # ----------------------------------------------------------------------------
143
+ # 4) Gradio Interface
144
+ # ----------------------------------------------------------------------------
145
+
146
def create_gradio_interface():
    """Build and return the Gradio interface for the resume ranker.

    The UI takes a job-description textbox and a multi-file upload, calls
    ``analyze_cvs`` with them, and shows the result in a two-column dataframe.
    """
    return gr.Interface(
        fn=analyze_cvs,
        inputs=[
            gr.Textbox(
                label="Job Description",
                placeholder="Describe the role here...",
                lines=4,
            ),
            gr.File(
                label="Upload resumes (PDF/DOCX/TXT)",
                file_count="multiple",
                type="filepath",
            ),
        ],
        outputs=[
            gr.Dataframe(
                headers=["Candidate CV", "Similarity Score"],
                label="Ranked Candidates",
            ),
        ],
        title="Resume Ranking with TF-IDF",
    )
152
+
153
+ # ----------------------------------------------------------------------------
154
+ # 5) Main Script
155
+ # ----------------------------------------------------------------------------
156
+
157
if __name__ == "__main__":
    # Fetch the NLTK data the preprocessing step relies on before serving.
    # word_tokenize loads the Punkt models as "punkt" on older NLTK releases
    # and as "punkt_tab" on newer ones, so both are requested; a resource
    # that is already present is not downloaded again.
    for resource in ("punkt", "punkt_tab", "stopwords"):
        nltk.download(resource, quiet=True)
    app = create_gradio_interface()
    # Listen on all interfaces at port 7860.
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ PyPDF2
2
+ docx2txt