Commit e9f8bde
Parent(s): 319051d
Upload 5 files

- app.py +72 -0
- constants (1).py +3 -0
- env-example.txt +2 -0
- requirements.txt +12 -0
- utils.py +110 -0
app.py
ADDED
@@ -0,0 +1,72 @@
+import streamlit as st
+import constants
+from utils import *
+import uuid
+
+#Creating session variables
+if 'unique_id' not in st.session_state:
+    st.session_state['unique_id'] = ''
+
+def main():
+
+    st.set_page_config(page_title="Resume Screening Assistance")
+    st.title("HR - Resume Screening Assistance...💁 ")
+    st.subheader("I can help you with the resume screening process")
+
+    job_description = st.text_area("Please paste the 'JOB DESCRIPTION' here...", key="1")
+    document_count = st.text_input("No. of 'RESUMES' to return", key="2")
+    #Upload the resumes (PDF files only)
+    pdf = st.file_uploader("Upload resumes here, only PDF files allowed", type=["pdf"], accept_multiple_files=True)
+
+    submit = st.button("Help me with the analysis")
+
+    if submit:
+        with st.spinner('Wait for it...'):
+
+            #Create a unique ID so we can retrieve only this user's uploaded documents from the Pinecone vector store
+            st.session_state['unique_id'] = uuid.uuid4().hex
+
+            #Create a documents list out of all the user-uploaded PDF files
+            final_docs_list = create_docs(pdf, st.session_state['unique_id'])
+            #st.write(final_docs_list)
+
+            #Display the count of resumes that have been uploaded
+            st.write("*Resumes uploaded* : " + str(len(final_docs_list)))
+
+            #Create an embeddings instance
+            embeddings = create_embeddings_load_data()
+
+            #Push data to Pinecone
+            #push_to_pinecone(constants.PINECONE_API_KEY, constants.PINECONE_ENVIRONMENT, constants.PINECONE_INDEX, embeddings, final_docs_list)
+
+            #Fetch relevant documents (the Pinecone path is commented out; FAISS is used in memory instead)
+            #relevant_docs = similar_docs(job_description, document_count, constants.PINECONE_API_KEY, constants.PINECONE_ENVIRONMENT, constants.PINECONE_INDEX, embeddings, st.session_state['unique_id'])
+            relevant_docs = close_matches(job_description, document_count, final_docs_list, embeddings)
+            #st.write(relevant_docs)
+
+            #Introduce a line separator
+            st.write(":heavy_minus_sign:" * 30)
+
+            #For each item in the relevant docs, display some of its info on the UI
+            for item in range(len(relevant_docs)):
+
+                st.subheader("👉 " + str(item + 1))
+
+                #Display the file name
+                st.write("**File** : " + relevant_docs[item][0].metadata['name'])
+
+                #Expander for the details
+                with st.expander('Show me 👀'):
+                    st.info("**Match Score** : " + str(1 - relevant_docs[item][1]))
+                    #st.write("***" + relevant_docs[item][0].page_content)
+
+                    #Get the summary of the current item using the 'get_summary' helper, which uses an LLM and a LangChain summarize chain
+                    summary = get_summary(relevant_docs[item][0])
+                    st.write("**Summary** : " + summary)
+
+        st.success("Hope I was able to save your time ❤️")
+
+
+#Invoke the main function
+if __name__ == '__main__':
+    main()
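For clarity on the relevant_docs[item][0] / [item][1] indexing above: close_matches (defined in utils.py below) returns the (Document, score) pairs produced by FAISS.similarity_search_with_score, where the score is a raw L2 distance (lower means closer), so the displayed "Match Score" of 1 - score is a rough heuristic rather than a bounded percentage. A minimal sketch of iterating those pairs, assuming relevant_docs is already populated:

    #Each entry is a (Document, distance) tuple from FAISS
    for doc, distance in relevant_docs:
        print(doc.metadata["name"], round(distance, 3))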
constants (1).py
ADDED
@@ -0,0 +1,3 @@
+PINECONE_API_KEY="a4405723-2309-4c5c-87d0-760f461fdef0"
+PINECONE_ENVIRONMENT="gcp-starter"
+PINECONE_INDEX="hresume"
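Note that this file hardcodes a Pinecone API key in a committed file, which exposes it publicly. Since python-dotenv is already in requirements.txt and env-example.txt (below) establishes the pattern, a safer alternative sketch (not what this commit does) would load the values from the environment:

    import os
    from dotenv import load_dotenv

    load_dotenv()  #reads a local .env file if present
    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")
    PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "gcp-starter")
    PINECONE_INDEX = os.getenv("PINECONE_INDEX", "hresume")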
env-example.txt
ADDED
@@ -0,0 +1,2 @@
+OPENAI_API_KEY=""
+HUGGINGFACEHUB_API_TOKEN=""
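Neither variable is read explicitly anywhere in the code; LangChain's OpenAI/OpenAIEmbeddings and HuggingFaceHub wrappers pick up OPENAI_API_KEY and HUGGINGFACEHUB_API_TOKEN from the environment, so exporting them (or copying this file to .env and loading it with python-dotenv) should be enough.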
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+langchain
+streamlit
+openai
+tiktoken
+python-dotenv
+unstructured
+pinecone-client
+pypdf
+sentence_transformers
+pdf2image
+pdfminer.six
+faiss-cpu
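To reproduce the Space locally, these would presumably be installed with pip install -r requirements.txt before launching the UI with streamlit run app.py.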
utils.py
ADDED
@@ -0,0 +1,110 @@
+import openai
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import Pinecone
+from langchain.llms import OpenAI
+from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain.schema import Document
+import pinecone
+from langchain.vectorstores import FAISS
+from pypdf import PdfReader
+from langchain.chains.summarize import load_summarize_chain
+from langchain import HuggingFaceHub
+from langchain.document_loaders import DirectoryLoader
+
+
+#Extract the text from a PDF file
+def get_pdf_text(pdf_doc):
+    text = ""
+    pdf_reader = PdfReader(pdf_doc)
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+
+
+#Iterate over the user-uploaded PDF files, one by one
+def create_docs(user_pdf_list, unique_id):
+    docs = []
+    for filename in user_pdf_list:
+
+        chunks = get_pdf_text(filename)
+
+        #Add items to our list - the data and its metadata
+        docs.append(Document(
+            page_content=chunks,
+            metadata={"name": filename.name, "id": filename.id, "type": filename.type, "size": filename.size, "unique_id": unique_id},
+        ))
+    #Load files from a directory (local version)
+    #loader = DirectoryLoader('./Repository', glob='**/*')
+    #docs1 = loader.load()
+    #final_docs = docs + docs1
+    return docs
+
+
+#Create an embeddings instance
+def create_embeddings_load_data():
+    embeddings = OpenAIEmbeddings()
+    #embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    return embeddings
+
+
+#Push data to the vector store - Pinecone here
+def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
+
+    pinecone.init(
+        api_key=pinecone_apikey,
+        environment=pinecone_environment
+    )
+    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
+
+
+#Pull information from the vector store - Pinecone here
+def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
+
+    pinecone.init(
+        api_key=pinecone_apikey,
+        environment=pinecone_environment
+    )
+
+    index_name = pinecone_index_name
+
+    index = Pinecone.from_existing_index(index_name, embeddings)
+    return index
+
+
+#Get relevant documents from the vector store, based on the user's input
+def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):
+
+    pinecone.init(
+        api_key=pinecone_apikey,
+        environment=pinecone_environment
+    )
+
+    index_name = pinecone_index_name
+
+    index = pull_from_pinecone(pinecone_apikey, pinecone_environment, index_name, embeddings)
+    #similar_docs = index.similarity_search_with_score(query, int(k), {"unique_id": unique_id})
+    similar_docs = index.similarity_search_with_score(query, int(k))
+    #print(similar_docs)
+    return similar_docs
+
+#Find close matches in memory with FAISS (no external vector store needed)
+def close_matches(query, k, docs, embeddings):
+    #https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.faiss.FAISS.html#langchain.vectorstores.faiss.FAISS.similarity_search_with_score
+    db = FAISS.from_documents(docs, embeddings)
+    similar_docs = db.similarity_search_with_score(query, int(k))
+    return similar_docs
+
+
+#Get the summary of a document
+def get_summary(current_doc):
+    llm = OpenAI(temperature=0)
+    #llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature":1e-10})
+    chain = load_summarize_chain(llm, chain_type="map_reduce")
+    summary = chain.run([current_doc])
+
+    return summary
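A minimal offline sketch (not part of the commit) of how these helpers chain together, assuming a local sample.pdf and an OPENAI_API_KEY in the environment. FakeUpload is a hypothetical stand-in for Streamlit's UploadedFile, which is what create_docs expects (it reads .name, .id, .type, and .size):

    import io
    from utils import create_docs, create_embeddings_load_data, close_matches, get_summary

    class FakeUpload(io.BytesIO):
        #Hypothetical stand-in mimicking Streamlit's UploadedFile interface
        def __init__(self, path):
            super().__init__(open(path, "rb").read())
            self.name, self.id, self.type, self.size = path, "1", "application/pdf", 0

    docs = create_docs([FakeUpload("sample.pdf")], unique_id="test")
    embeddings = create_embeddings_load_data()
    matches = close_matches("Python developer with NLP experience", 1, docs, embeddings)
    for doc, distance in matches:
        print(doc.metadata["name"], distance)
        print(get_summary(doc))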