Spaces:

ruslanmv
/

CV_Ranking

Sleeping

App Files Files Community

ruslanmv committed on Feb 4

Commit

4813a0f

1 Parent(s): a74397e

First commit

Browse files

Files changed (2) hide show

app.py +161 -0
requirements.txt +2 -0

app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import os
+import gradio as gr
+import PyPDF2
+import docx2txt
+import logging
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+# Configure the root logger once at import time: DEBUG level, with the
+# timestamp, level name and message on every record.
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+# ----------------------------------------------------------------------------
+# 1) Utility Functions: Parsing & Preprocessing
+# ----------------------------------------------------------------------------
+def extract_text_from_pdf(file_obj):
+    """Extract all text from a PDF file object."""
+    text_content = []
+    try:
+        logging.info("Loading PDF file.")
+        pdf_reader = PyPDF2.PdfReader(file_obj)
+        for page in pdf_reader.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text_content.append(page_text)
+        extracted_text = "\n".join(text_content)
+        logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
+        print(extracted_text)  # Print the extracted text
+        return extracted_text
+    except Exception as e:
+        logging.error(f"Error reading PDF: {e}")
+        return f"Error reading PDF: {e}"
+def extract_text_from_docx(file_path):
+    """Extract all text from a DOCX file on disk."""
+    try:
+        logging.info("Loading DOCX file.")
+        extracted_text = docx2txt.process(file_path)
+        logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
+        print(extracted_text) # Print the extracted text
+        return extracted_text
+    except Exception as e:
+        logging.error(f"Error reading DOCX: {e}")
+        return f"Error reading DOCX: {e}"
+def extract_text_from_txt(file_obj):
+    """Extract all text from a TXT file object."""
+    try:
+        logging.info("Loading TXT file.")
+        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
+        logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
+        print(extracted_text) # Print the extracted text
+        return extracted_text
+    except Exception as e:
+        logging.error(f"Error reading TXT: {e}")
+        return f"Error reading TXT: {e}"
+def preprocess_text(text):
+    """
+    Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
+    and then rejoin into a clean string.
+    """
+    logging.info("Preprocessing text.")
+    text = str(text).lower()
+    tokens = word_tokenize(text)
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
+    processed_text = " ".join(filtered_tokens)
+    logging.info(f"Preprocessed text: {processed_text[:500]}...")
+    return processed_text
+# ----------------------------------------------------------------------------
+# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
+# ----------------------------------------------------------------------------
+def rank_resumes_with_tfidf(job_description: str, resumes: dict):
+    logging.info("Ranking resumes using TF-IDF.")
+    preprocessed_jd = preprocess_text(job_description)
+    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
+    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
+    filenames = list(preprocessed_resumes.keys())
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(corpus)
+    jd_vector = tfidf_matrix[0:1]
+    resume_vectors = tfidf_matrix[1:]
+    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
+    results = list(zip(filenames, similarities))
+    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
+    logging.info(f"Ranking completed: {results_sorted}")
+    return results_sorted
+# ----------------------------------------------------------------------------
+# 3) Gradio Callback Function
+# ----------------------------------------------------------------------------
+def analyze_cvs(job_description, cv_files):
+    logging.info("Starting CV analysis.")
+    resumes_data = {}
+    for uploaded_file in cv_files:
+        filename = os.path.basename(uploaded_file.name) #Get the base name, handling potential Gradio changes
+        file_ext = os.path.splitext(filename)[1].lower()
+        temp_filepath = None
+        try:
+            logging.info(f"Processing file: {filename}")
+            if file_ext == ".pdf":
+                with open(uploaded_file.name, "rb") as f: # Open the temporary file created by gradio
+                    file_content = extract_text_from_pdf(f)
+            elif file_ext == ".txt":
+                with open(uploaded_file.name, "rb") as f: # Open the temporary file created by gradio
+                    file_content = extract_text_from_txt(f)
+            elif file_ext == ".docx":
+                file_content = extract_text_from_docx(uploaded_file.name) #docx2txt can handle the temporary filepath
+            else:
+                file_content = "Unsupported file type."
+        except Exception as e:
+            logging.error(f"Error processing file: {e}")
+            file_content = f"Error processing file: {e}"
+        logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
+        resumes_data[filename] = file_content
+    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
+    display_data = [[filename, round(float(score), 3)] for filename, score in ranked_results]
+    logging.info("Analysis completed successfully.")
+    return display_data
+# ----------------------------------------------------------------------------
+# 4) Gradio Interface
+# ----------------------------------------------------------------------------
+def create_gradio_interface():
+    job_description_input = gr.Textbox(label="Job Description", placeholder="Describe the role here...", lines=4)
+    cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)", file_count="multiple", type="filepath")
+    results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"], label="Ranked Candidates")
+    demo = gr.Interface(fn=analyze_cvs, inputs=[job_description_input, cv_input], outputs=[results_output], title="Resume Ranking with TF-IDF")
+    return demo
+# ----------------------------------------------------------------------------
+# 5) Main Script
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+    nltk.download('punkt', quiet=True)
+    nltk.download('stopwords', quiet=True)
+    app = create_gradio_interface()
+    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ PyPDF2
2	+ docx2txt