khrek commited on
Commit
0d375ed
·
1 Parent(s): ca25308

Upload 8 files

Browse files
Files changed (8) hide show
  1. app.py +16 -0
  2. models.py +49 -0
  3. output_model.py +32 -0
  4. parser.py +125 -0
  5. reader.py +25 -0
  6. requirements.txt +483 -0
  7. sections.json +127 -0
  8. segmenter.py +105 -0
app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydoc import describe
2
+ import gradio
3
+ from main import Main
4
+
5
+ main = Main()
6
+
7
+ def parse(cv):
8
+ return main.parse(cv.name)
9
+
10
+ description = "This is a demo of the resume parser. \
11
+ Upload a resume and it will return a JSON object with a detailed parsed resume data."
12
+ article = "Demo of detailed resume parser"
13
+ file_input = gradio.inputs.File(file_count="single", type="file", label="Upload your pdf resume (en)")
14
+ iface = gradio.Interface(fn=parse, inputs=file_input, outputs="json",
15
+ title="Detailed Resume Parser", description=description, article=article)
16
+ iface.launch()
models.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import sentencepiece
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
4
+ from langchain import PromptTemplate, LLMChain, HuggingFacePipeline
5
+ import ast
6
+ class Models():
7
+ def __init__(self) -> None:
8
+ self.template = """
9
+ A virtual assistant answers questions from a user based on the provided text.
10
+ USER: Text: {input_text}
11
+ ASSISTANT: I’ve read this text.
12
+ USER: What describes {entity_type} in the text?
13
+ ASSISTANT:
14
+ """
15
+ self.load_trained_models()
16
+
17
+ def load_trained_models(self):
18
+ #is it best to keep in memory why not pickle?
19
+ checkpoint = "Universal-NER/UniNER-7B-all"
20
+
21
+ ner_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16, offload_folder="offload", offload_state_dict = True)
22
+ tokenizer = AutoTokenizer.from_pretrained("Universal-NER/UniNER-7B-all", use_fast=False, padding="max_length")
23
+ pipeline = pipeline(
24
+ "text-generation", #task
25
+ model=ner_model,
26
+ max_length=1000,
27
+ tokenizer=tokenizer,
28
+ trust_remote_code=True,
29
+ do_sample=True,
30
+ top_k=10,
31
+ num_return_sequences=1
32
+ )
33
+
34
+ self.llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})
35
+ self.prompt = PromptTemplate(template=self.template, input_variables=["input_text","entity_type"])
36
+ self.llm_chain = LLMChain(prompt=self.prompt, llm=self.llm)
37
+
38
+ def extract_ner(self, context, entity_type):
39
+ return ast.literal_eval(self.llm_chain.run({"input_text":context,"entity_type":entity_type}))
40
+
41
+ def get_ner(self, clean_lines, entity):
42
+ tokens = []
43
+ try_num = 0
44
+ while try_num < 5 and tokens == []:
45
+ tokens = self.extract_ner(' '.join(clean_lines), entity)
46
+ if len(tokens) == 0:
47
+ raise ValueError("Couldnt extract {entity}")
48
+ return tokens
49
+
output_model.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ class Work_experience(BaseModel):
8
+ position:List[str]
9
+ company:List[str]
10
+ start_date:Optional[str] = ""
11
+ end_date:Optional[str] = ""
12
+ description:Optional[str] = ""
13
+ location:Optional[List[str]] = []
14
+
15
+
16
+ class Education(BaseModel):
17
+ degree:str = ""
18
+ major:List[str] = []
19
+ university:List[str] = []
20
+ start_date:Optional[str] = ""
21
+ end_date:Optional[str] = ""
22
+ location:Optional[List[str]] = []
23
+
24
+ class Basic_info(BaseModel):
25
+ name: str
26
+ email : Optional[str] = ""
27
+ phone: Optional[str] = ""
28
+
29
+ class ModelOutput(BaseModel):
30
+ basic_info: Basic_info
31
+ education: Optional[List[Education]] = None
32
+ work_experience: Optional[List[Work_experience]] = None
parser.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import chain
2
+ from models.prototype.models import Models
3
+ #from output_model import OutputModel, WorkExperience
4
+ from models.prototype.segmenter import ResumeSegmenter
5
+ from flashtext import KeywordProcessor
6
+ from collections import defaultdict
7
+ class ResumeParser():
8
+ def __init__(self) -> None:
9
+ self.resumeSegmenter = ResumeSegmenter()
10
+ self.models = Models()
11
+
12
+
13
+ def get_date_index(self, clean_resume_lines, date):
14
+ indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
15
+ return indexes
16
+
17
+ #better suited to a utils file
18
+ def sort_tokens_table(self, tokens_data):
19
+ table = {}
20
+ for key, tokens in tokens_data:
21
+ for token in tokens:
22
+ table[token] = key
23
+ return table
24
+
25
+ def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
26
+
27
+ dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
28
+ dates_indexes = list(chain.from_iterable(dates_indexes))
29
+ dates_indexes = [i + start_index for i in dates_indexes]
30
+ #this list should be unique and ordered
31
+ dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
32
+ dates_indexes = set(dates_indexes)
33
+ dates_indexes = list(dates_indexes)
34
+
35
+ list_single_work_exp = []
36
+ for i in range(len(dates_indexes)-1):
37
+ index = dates_indexes[i]
38
+ next_index = dates_indexes[i+1]
39
+ section = resume_lines[index:next_index]
40
+ if len(section) == 0:
41
+ continue
42
+ list_single_work_exp.append(section)
43
+ return list_single_work_exp
44
+
45
+ def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
46
+ text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
47
+ start_index = sections[section_header][0]
48
+ end_index = sections[section_header][1]
49
+ #on the bases dates would be unique
50
+ return start_index, end_index
51
+
52
+ #more of a utils function
53
+ def sort_tokens_table(tokens_data):
54
+ table = {}
55
+ for key, tokens in tokens_data:
56
+ for token in tokens:
57
+ table[token] = key
58
+ return table
59
+
60
+ def format_output(self, keywords, work_section_list, isWorkExp=True):
61
+ if isWorkExp:
62
+ headlines = [text[0] for text in work_section_list]
63
+ else:
64
+ headlines = work_section_list
65
+ table = self.sort_tokens_table(keywords)
66
+ tokens_processor = KeywordProcessor()
67
+ list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
68
+ tokens_processor.add_keywords_from_list(list_keywords)
69
+ data = []
70
+ for i, header in enumerate(headlines):
71
+ current_data = defaultdict(list)
72
+ tokens = tokens_processor.extract_keywords(header)
73
+ for token in tokens:
74
+ current_data[table[token]].append(token)
75
+ if isWorkExp:
76
+ current_data["description"] = work_section_list[i][1:]
77
+ data.append(dict(current_data))
78
+ return data
79
+
80
+ def parse_work_history(self, resume_lines):
81
+ start_index, end_index = self.extract_section_text(resume_lines)
82
+ work_dates = self.models.get_ner(resume_lines[start_index:end_index], "date")
83
+ single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
84
+ job_positions = self.models.get_ner(resume_lines[start_index:end_index], "job title")
85
+ companies = self.models.get_ner(resume_lines[start_index:end_index], "company")
86
+ keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
87
+ return self.format_output(keywords, single_work_experiences)
88
+
89
+ def parse_education(self, resume_lines):
90
+ start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
91
+ tokens = ["degree", "university", "degree field", "date", "location"]
92
+
93
+ for token in tokens:
94
+ keywords = self.get_ner(resume_lines[start_index+1:end_index], token)
95
+ output = self.format_output(keywords, resume_lines[start_index:end_index], False)
96
+ output = [res for res in output if res]
97
+
98
+ return output
99
+
100
+ def parse_basic_info(self,resume_lines):
101
+ start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
102
+ #tokens = ["person", "email", "phone"]
103
+ tokens = ["person"]
104
+ for token in tokens:
105
+ keywords = self.models.get_ner(resume_lines[start_index:end_index], token)
106
+
107
+ output = {}
108
+ for token, result in keywords:
109
+ if len(result) > 0:
110
+ output[token] = result[0]
111
+ return output
112
+
113
+ def parse(self, resume_lines):
114
+ jobs = self.parse_work_history(resume_lines)
115
+ education = self.parse_education(resume_lines)
116
+ basic_info = self.parse_basic_info(resume_lines)
117
+
118
+ return {"basic_info":basic_info, "education":education, "work_experience":jobs}
119
+
120
+
121
+
122
+
123
+
124
+
125
+
reader.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pypdfium2 as pdfium
2
+ import re
3
+ class ResumeReader:
4
+
5
+ def clean_text(self, raw_text):
6
+ clean_text = re.sub(r'\n+', '\n', raw_text)
7
+ clean_text = clean_text.replace("\r", "\n")
8
+ clean_text = clean_text.replace("\t", " ")
9
+ clean_text = re.sub(r"\uf0b7", " ", clean_text)
10
+ clean_text = re.sub(r'[^\x00-\x7F]+', '', clean_text) #remove non-ascii
11
+ clean_text = re.sub(r"\(cid:\d{0,3}\)", " ", clean_text)
12
+ clean_text = re.sub(r'• ', " ", clean_text)
13
+ return clean_text
14
+
15
+ def read_pdf(self, path_file):
16
+ raw_text = ""
17
+ pdf = pdfium.PdfDocument(path_file)
18
+ for page in pdf:
19
+ raw_text += page.get_textpage().get_text_range()
20
+ clean_text = self.clean_text(raw_text)
21
+ resume_lines = clean_text.splitlines(True)
22
+ resume_lines = [re.sub('\s+', ' ', line.strip()) for line in resume_lines if line.strip()]
23
+ return resume_lines
24
+
25
+
requirements.txt ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.4.0
2
+ accelerate==0.24.1
3
+ aiohttp==3.8.6
4
+ aiosignal==1.3.1
5
+ alabaster==0.7.13
6
+ albumentations==1.3.1
7
+ altair==4.2.2
8
+ anyio==3.7.1
9
+ appdirs==1.4.4
10
+ argon2-cffi==23.1.0
11
+ argon2-cffi-bindings==21.2.0
12
+ array-record==0.5.0
13
+ arviz==0.15.1
14
+ astropy==5.3.4
15
+ astunparse==1.6.3
16
+ async-timeout==4.0.3
17
+ atpublic==4.0
18
+ attrs==23.1.0
19
+ audioread==3.0.1
20
+ autograd==1.6.2
21
+ Babel==2.13.1
22
+ backcall==0.2.0
23
+ beautifulsoup4==4.11.2
24
+ bidict==0.22.1
25
+ bigframes==0.13.0
26
+ bleach==6.1.0
27
+ blinker==1.4
28
+ blis==0.7.11
29
+ blosc2==2.0.0
30
+ bokeh==3.3.0
31
+ bqplot==0.12.42
32
+ branca==0.7.0
33
+ build==1.0.3
34
+ CacheControl==0.13.1
35
+ cachetools==5.3.2
36
+ catalogue==2.0.10
37
+ certifi==2023.7.22
38
+ cffi==1.16.0
39
+ chardet==5.2.0
40
+ charset-normalizer==3.3.2
41
+ chex==0.1.7
42
+ click==8.1.7
43
+ click-plugins==1.1.1
44
+ cligj==0.7.2
45
+ cloudpickle==2.2.1
46
+ cmake==3.27.7
47
+ cmdstanpy==1.2.0
48
+ colorcet==3.0.1
49
+ colorlover==0.3.0
50
+ colour==0.1.5
51
+ community==1.0.0b1
52
+ confection==0.1.3
53
+ cons==0.4.6
54
+ contextlib2==21.6.0
55
+ contourpy==1.2.0
56
+ cryptography==41.0.5
57
+ cufflinks==0.17.3
58
+ cupy-cuda11x==11.0.0
59
+ cvxopt==1.3.2
60
+ cvxpy==1.3.2
61
+ cycler==0.12.1
62
+ cymem==2.0.8
63
+ Cython==3.0.5
64
+ dask==2023.8.1
65
+ dataclasses-json==0.6.2
66
+ datascience==0.17.6
67
+ db-dtypes==1.1.1
68
+ dbus-python==1.2.18
69
+ debugpy==1.6.6
70
+ decorator==4.4.2
71
+ defusedxml==0.7.1
72
+ diskcache==5.6.3
73
+ distributed==2023.8.1
74
+ distro==1.7.0
75
+ dlib==19.24.2
76
+ dm-tree==0.1.8
77
+ docutils==0.18.1
78
+ dopamine-rl==4.0.6
79
+ duckdb==0.9.1
80
+ earthengine-api==0.1.377
81
+ easydict==1.11
82
+ ecos==2.0.12
83
+ editdistance==0.6.2
84
+ eerepr==0.0.4
85
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
86
+ entrypoints==0.4
87
+ et-xmlfile==1.1.0
88
+ etils==1.5.2
89
+ etuples==0.3.9
90
+ exceptiongroup==1.1.3
91
+ fastai==2.7.13
92
+ fastcore==1.5.29
93
+ fastdownload==0.0.7
94
+ fastjsonschema==2.18.1
95
+ fastprogress==1.0.3
96
+ fastrlock==0.8.2
97
+ filelock==3.13.1
98
+ fiona==1.9.5
99
+ firebase-admin==5.3.0
100
+ flashtext==2.7
101
+ Flask==2.2.5
102
+ flatbuffers==23.5.26
103
+ flax==0.7.5
104
+ folium==0.14.0
105
+ fonttools==4.44.0
106
+ frozendict==2.3.8
107
+ frozenlist==1.4.0
108
+ fsspec==2023.6.0
109
+ future==0.18.3
110
+ gast==0.5.4
111
+ gcsfs==2023.6.0
112
+ GDAL==3.4.3
113
+ gdown==4.6.6
114
+ geemap==0.28.2
115
+ gensim==4.3.2
116
+ geocoder==1.38.1
117
+ geographiclib==2.0
118
+ geopandas==0.13.2
119
+ geopy==2.3.0
120
+ gin-config==0.5.0
121
+ glob2==0.7
122
+ google==2.0.3
123
+ google-api-core==2.11.1
124
+ google-api-python-client==2.84.0
125
+ google-auth==2.17.3
126
+ google-auth-httplib2==0.1.1
127
+ google-auth-oauthlib==1.0.0
128
+ google-cloud-bigquery==3.12.0
129
+ google-cloud-bigquery-connection==1.12.1
130
+ google-cloud-bigquery-storage==2.22.0
131
+ google-cloud-core==2.3.3
132
+ google-cloud-datastore==2.15.2
133
+ google-cloud-firestore==2.11.1
134
+ google-cloud-functions==1.13.3
135
+ google-cloud-iam==2.12.2
136
+ google-cloud-language==2.9.1
137
+ google-cloud-resource-manager==1.10.4
138
+ google-cloud-storage==2.8.0
139
+ google-cloud-translate==3.11.3
140
+ google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz#sha256=a7913e00463ccd8df75a61e36d8582af57905f6b05b88aa768c70a0d631990ef
141
+ google-crc32c==1.5.0
142
+ google-pasta==0.2.0
143
+ google-resumable-media==2.6.0
144
+ googleapis-common-protos==1.61.0
145
+ googledrivedownloader==0.4
146
+ graphviz==0.20.1
147
+ greenlet==3.0.1
148
+ grpc-google-iam-v1==0.12.6
149
+ grpcio==1.59.2
150
+ grpcio-status==1.48.2
151
+ gspread==3.4.2
152
+ gspread-dataframe==3.3.1
153
+ gym==0.25.2
154
+ gym-notices==0.0.8
155
+ h5netcdf==1.3.0
156
+ h5py==3.9.0
157
+ holidays==0.36
158
+ holoviews==1.17.1
159
+ html5lib==1.1
160
+ httpimport==1.3.1
161
+ httplib2==0.22.0
162
+ huggingface-hub==0.17.3
163
+ humanize==4.7.0
164
+ hyperopt==0.2.7
165
+ ibis-framework==6.2.0
166
+ idna==3.4
167
+ imageio==2.31.6
168
+ imageio-ffmpeg==0.4.9
169
+ imagesize==1.4.1
170
+ imbalanced-learn==0.10.1
171
+ imgaug==0.4.0
172
+ importlib-metadata==6.8.0
173
+ importlib-resources==6.1.1
174
+ imutils==0.5.4
175
+ inflect==7.0.0
176
+ iniconfig==2.0.0
177
+ install==1.3.5
178
+ intel-openmp==2023.2.0
179
+ ipyevents==2.0.2
180
+ ipyfilechooser==0.6.0
181
+ ipykernel==5.5.6
182
+ ipyleaflet==0.17.4
183
+ ipython==7.34.0
184
+ ipython-genutils==0.2.0
185
+ ipython-sql==0.5.0
186
+ ipytree==0.2.2
187
+ ipywidgets==7.7.1
188
+ itsdangerous==2.1.2
189
+ jax==0.4.20
190
+ jaxlib @ https://storage.googleapis.com/jax-releases/cuda11/jaxlib-0.4.20+cuda11.cudnn86-cp310-cp310-manylinux2014_x86_64.whl#sha256=01be66238133f884bf5adf15cd7eaaf8445f9d4b056c5c64df28a997a6aff2fe
191
+ jeepney==0.7.1
192
+ jieba==0.42.1
193
+ Jinja2==3.1.2
194
+ joblib==1.3.2
195
+ jsonpatch==1.33
196
+ jsonpickle==3.0.2
197
+ jsonpointer==2.4
198
+ jsonschema==4.19.2
199
+ jsonschema-specifications==2023.7.1
200
+ jupyter-client==6.1.12
201
+ jupyter-console==6.1.0
202
+ jupyter-server==1.24.0
203
+ jupyter_core==5.5.0
204
+ jupyterlab-pygments==0.2.2
205
+ jupyterlab-widgets==3.0.9
206
+ kaggle==1.5.16
207
+ keras==2.14.0
208
+ keyring==23.5.0
209
+ kiwisolver==1.4.5
210
+ langchain==0.0.334
211
+ langcodes==3.3.0
212
+ langsmith==0.0.63
213
+ launchpadlib==1.10.16
214
+ lazr.restfulclient==0.14.4
215
+ lazr.uri==1.0.6
216
+ lazy_loader==0.3
217
+ libclang==16.0.6
218
+ librosa==0.10.1
219
+ lida==0.0.10
220
+ lightgbm==4.1.0
221
+ linkify-it-py==2.0.2
222
+ llmx==0.0.15a0
223
+ llvmlite==0.41.1
224
+ locket==1.0.0
225
+ logical-unification==0.4.6
226
+ lxml==4.9.3
227
+ malloy==2023.1064
228
+ Markdown==3.5.1
229
+ markdown-it-py==3.0.0
230
+ MarkupSafe==2.1.3
231
+ marshmallow==3.20.1
232
+ matplotlib==3.7.1
233
+ matplotlib-inline==0.1.6
234
+ matplotlib-venn==0.11.9
235
+ mdit-py-plugins==0.4.0
236
+ mdurl==0.1.2
237
+ miniKanren==1.0.3
238
+ missingno==0.5.2
239
+ mistune==0.8.4
240
+ mizani==0.9.3
241
+ mkl==2023.2.0
242
+ ml-dtypes==0.2.0
243
+ mlxtend==0.22.0
244
+ more-itertools==10.1.0
245
+ moviepy==1.0.3
246
+ mpmath==1.3.0
247
+ msgpack==1.0.7
248
+ multidict==6.0.4
249
+ multipledispatch==1.0.0
250
+ multitasking==0.0.11
251
+ murmurhash==1.0.10
252
+ music21==9.1.0
253
+ mypy-extensions==1.0.0
254
+ natsort==8.4.0
255
+ nbclassic==1.0.0
256
+ nbclient==0.9.0
257
+ nbconvert==6.5.4
258
+ nbformat==5.9.2
259
+ nest-asyncio==1.5.8
260
+ networkx==3.2.1
261
+ nibabel==4.0.2
262
+ nltk==3.8.1
263
+ notebook==6.5.5
264
+ notebook_shim==0.2.3
265
+ numba==0.58.1
266
+ numexpr==2.8.7
267
+ numpy==1.23.5
268
+ oauth2client==4.1.3
269
+ oauthlib==3.2.2
270
+ opencv-contrib-python==4.8.0.76
271
+ opencv-python==4.8.0.76
272
+ opencv-python-headless==4.8.1.78
273
+ openpyxl==3.1.2
274
+ opt-einsum==3.3.0
275
+ optax==0.1.7
276
+ orbax-checkpoint==0.4.2
277
+ osqp==0.6.2.post8
278
+ packaging==23.2
279
+ pandas==1.5.3
280
+ pandas-datareader==0.10.0
281
+ pandas-gbq==0.17.9
282
+ pandas-stubs==1.5.3.230304
283
+ pandocfilters==1.5.0
284
+ panel==1.3.1
285
+ param==2.0.0
286
+ parso==0.8.3
287
+ parsy==2.1
288
+ partd==1.4.1
289
+ pathlib==1.0.1
290
+ pathy==0.10.3
291
+ patsy==0.5.3
292
+ peewee==3.17.0
293
+ pexpect==4.8.0
294
+ pickleshare==0.7.5
295
+ Pillow==9.4.0
296
+ pip-tools==6.13.0
297
+ platformdirs==3.11.0
298
+ plotly==5.15.0
299
+ plotnine==0.12.4
300
+ pluggy==1.3.0
301
+ polars==0.17.3
302
+ pooch==1.8.0
303
+ portpicker==1.5.2
304
+ prefetch-generator==1.0.3
305
+ preshed==3.0.9
306
+ prettytable==3.9.0
307
+ proglog==0.1.10
308
+ progressbar2==4.2.0
309
+ prometheus-client==0.18.0
310
+ promise==2.3
311
+ prompt-toolkit==3.0.39
312
+ prophet==1.1.5
313
+ proto-plus==1.22.3
314
+ protobuf==3.20.3
315
+ psutil==5.9.5
316
+ psycopg2==2.9.9
317
+ ptyprocess==0.7.0
318
+ py-cpuinfo==9.0.0
319
+ py4j==0.10.9.7
320
+ pyarrow==9.0.0
321
+ pyasn1==0.5.0
322
+ pyasn1-modules==0.3.0
323
+ pycocotools==2.0.7
324
+ pycparser==2.21
325
+ pyct==0.5.0
326
+ pydantic==1.10.13
327
+ pydata-google-auth==1.8.2
328
+ pydot==1.4.2
329
+ pydot-ng==2.0.0
330
+ pydotplus==2.0.2
331
+ PyDrive==1.3.1
332
+ PyDrive2==1.6.3
333
+ pyerfa==2.0.1.1
334
+ pygame==2.5.2
335
+ Pygments==2.16.1
336
+ PyGObject==3.42.1
337
+ PyJWT==2.3.0
338
+ pymc==5.7.2
339
+ pymystem3==0.2.0
340
+ PyOpenGL==3.1.7
341
+ pyOpenSSL==23.3.0
342
+ pyparsing==3.1.1
343
+ pypdfium2==4.24.0
344
+ pyperclip==1.8.2
345
+ pyproj==3.6.1
346
+ pyproject_hooks==1.0.0
347
+ pyshp==2.3.1
348
+ PySocks==1.7.1
349
+ pytensor==2.14.2
350
+ pytest==7.4.3
351
+ python-apt==0.0.0
352
+ python-box==7.1.1
353
+ python-dateutil==2.8.2
354
+ python-louvain==0.16
355
+ python-slugify==8.0.1
356
+ python-utils==3.8.1
357
+ pytz==2023.3.post1
358
+ pyviz_comms==3.0.0
359
+ PyWavelets==1.4.1
360
+ PyYAML==6.0.1
361
+ pyzmq==23.2.1
362
+ qdldl==0.1.7.post0
363
+ qudida==0.0.4
364
+ ratelim==0.1.6
365
+ referencing==0.30.2
366
+ regex==2023.6.3
367
+ requests==2.31.0
368
+ requests-oauthlib==1.3.1
369
+ requirements-parser==0.5.0
370
+ rich==13.6.0
371
+ rpds-py==0.12.0
372
+ rpy2==3.4.2
373
+ rsa==4.9
374
+ safetensors==0.4.0
375
+ scikit-image==0.19.3
376
+ scikit-learn==1.2.2
377
+ scipy==1.11.3
378
+ scooby==0.9.2
379
+ scs==3.2.4
380
+ seaborn==0.12.2
381
+ SecretStorage==3.3.1
382
+ Send2Trash==1.8.2
383
+ sentencepiece==0.1.99
384
+ shapely==2.0.2
385
+ six==1.16.0
386
+ sklearn-pandas==2.2.0
387
+ smart-open==6.4.0
388
+ sniffio==1.3.0
389
+ snowballstemmer==2.2.0
390
+ sortedcontainers==2.4.0
391
+ soundfile==0.12.1
392
+ soupsieve==2.5
393
+ soxr==0.3.7
394
+ spacy==3.6.1
395
+ spacy-legacy==3.0.12
396
+ spacy-loggers==1.0.5
397
+ Sphinx==5.0.2
398
+ sphinxcontrib-applehelp==1.0.7
399
+ sphinxcontrib-devhelp==1.0.5
400
+ sphinxcontrib-htmlhelp==2.0.4
401
+ sphinxcontrib-jsmath==1.0.1
402
+ sphinxcontrib-qthelp==1.0.6
403
+ sphinxcontrib-serializinghtml==1.1.9
404
+ SQLAlchemy==2.0.23
405
+ sqlglot==17.16.2
406
+ sqlparse==0.4.4
407
+ srsly==2.4.8
408
+ stanio==0.3.0
409
+ statsmodels==0.14.0
410
+ sympy==1.12
411
+ tables==3.8.0
412
+ tabulate==0.9.0
413
+ tbb==2021.10.0
414
+ tblib==3.0.0
415
+ tenacity==8.2.3
416
+ tensorboard==2.14.1
417
+ tensorboard-data-server==0.7.2
418
+ tensorflow==2.14.0
419
+ tensorflow-datasets==4.9.3
420
+ tensorflow-estimator==2.14.0
421
+ tensorflow-gcs-config==2.14.0
422
+ tensorflow-hub==0.15.0
423
+ tensorflow-io-gcs-filesystem==0.34.0
424
+ tensorflow-metadata==1.14.0
425
+ tensorflow-probability==0.22.0
426
+ tensorstore==0.1.45
427
+ termcolor==2.3.0
428
+ terminado==0.17.1
429
+ text-unidecode==1.3
430
+ textblob==0.17.1
431
+ tf-slim==1.1.0
432
+ thinc==8.1.12
433
+ threadpoolctl==3.2.0
434
+ tifffile==2023.9.26
435
+ tinycss2==1.2.1
436
+ tokenizers==0.14.1
437
+ toml==0.10.2
438
+ tomli==2.0.1
439
+ toolz==0.12.0
440
+ torch @ https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=a81b554184492005543ddc32e96469f9369d778dedd195d73bda9bed407d6589
441
+ torchaudio @ https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=cdfd0a129406155eee595f408cafbb92589652da4090d1d2040f5453d4cae71f
442
+ torchdata==0.7.0
443
+ torchsummary==1.5.1
444
+ torchtext==0.16.0
445
+ torchvision @ https://download.pytorch.org/whl/cu118/torchvision-0.16.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=033712f65d45afe806676c4129dfe601ad1321d9e092df62b15847c02d4061dc
446
+ tornado==6.3.2
447
+ tqdm==4.66.1
448
+ traitlets==5.7.1
449
+ traittypes==0.2.1
450
+ transformers==4.35.0
451
+ triton==2.1.0
452
+ tweepy==4.14.0
453
+ typer==0.9.0
454
+ types-pytz==2023.3.1.1
455
+ types-setuptools==68.2.0.1
456
+ typing-inspect==0.9.0
457
+ typing_extensions==4.5.0
458
+ tzlocal==5.2
459
+ uc-micro-py==1.0.2
460
+ uritemplate==4.1.1
461
+ urllib3==2.0.7
462
+ vega-datasets==0.9.0
463
+ wadllib==1.3.6
464
+ wasabi==1.1.2
465
+ wcwidth==0.2.9
466
+ webcolors==1.13
467
+ webencodings==0.5.1
468
+ websocket-client==1.6.4
469
+ Werkzeug==3.0.1
470
+ widgetsnbextension==3.6.6
471
+ wordcloud==1.9.2
472
+ wrapt==1.14.1
473
+ xarray==2023.7.0
474
+ xarray-einstats==0.6.0
475
+ xgboost==2.0.1
476
+ xlrd==2.0.1
477
+ xxhash==3.4.1
478
+ xyzservices==2023.10.1
479
+ yarl==1.9.2
480
+ yellowbrick==1.5
481
+ yfinance==0.2.31
482
+ zict==3.0.0
483
+ zipp==3.17.0
sections.json ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "section_headers": {
3
+ "objective": [
4
+ "career goal",
5
+ "objective",
6
+ "career objective",
7
+ "employment objective",
8
+ "professional objective",
9
+ "summary",
10
+ "summary of qualifications"
11
+ ],
12
+ "work_and_employment": [
13
+ "employment history",
14
+ "employment data",
15
+ "career summary",
16
+ "work history",
17
+ "working history",
18
+ "work experience",
19
+ "experience",
20
+ "professional experience",
21
+ "professional background",
22
+ "professional employment",
23
+ "additional experience",
24
+ "career related experience",
25
+ "professional employment history",
26
+ "related experience",
27
+ "relevant experience",
28
+ "programming experience",
29
+ "freelance",
30
+ "freelance experience",
31
+ "army experience",
32
+ "military experience",
33
+ "military background"
34
+ ],
35
+ "education_and_training": [
36
+ "academic background",
37
+ "academic experience",
38
+ "programs",
39
+ "courses",
40
+ "related courses",
41
+ "education",
42
+ "educational background",
43
+ "educational qualifications",
44
+ "educational training",
45
+ "education and training",
46
+ "training",
47
+ "academic training",
48
+ "Academic Qualification",
49
+ "professional training",
50
+ "course project experience",
51
+ "related course projects",
52
+ "internship experience",
53
+ "internships",
54
+ "apprenticeships",
55
+ "college activities",
56
+ "certifications",
57
+ "special training"
58
+ ],
59
+ "skills": [
60
+ "credentials",
61
+ "qualifications",
62
+ "areas of experience",
63
+ "areas of expertise",
64
+ "areas of knowledge",
65
+ "skills",
66
+ "Skills",
67
+ "other skills",
68
+ "other abilities",
69
+ "career related skills",
70
+ "professional skills",
71
+ "specialized skills",
72
+ "technical skills",
73
+ "computer skills",
74
+ "personal skills",
75
+ "computer knowledge",
76
+ "technologies",
77
+ "technical experience",
78
+ "proficiencies",
79
+ "languages",
80
+ "language competencies and skills",
81
+ "programming languages",
82
+ "competencies"
83
+ ],
84
+ "misc": [
85
+ "activities and honors",
86
+ "activities",
87
+ "affiliations",
88
+ "professional affiliations",
89
+ "associations",
90
+ "professional associations",
91
+ "memberships",
92
+ "professional memberships",
93
+ "athletic involvement",
94
+ "community involvement",
95
+ "refere",
96
+ "civic activities",
97
+ "extra-Curricular activities",
98
+ "professional activities",
99
+ "volunteer work",
100
+ "volunteer experience",
101
+ "additional information",
102
+ "interests"
103
+ ],
104
+ "accomplishments": [
105
+ "awards",
106
+ "achievement",
107
+ "awards and achievements",
108
+ "licenses",
109
+ "presentations",
110
+ "conference presentations",
111
+ "conventions",
112
+ "dissertations",
113
+ "exhibits",
114
+ "papers",
115
+ "publications",
116
+ "professional publications",
117
+ "research experience",
118
+ "research grants",
119
+ "projects",
120
+ "research projects",
121
+ "personal projects",
122
+ "current research interests",
123
+ "thesis",
124
+ "theses"
125
+ ]
126
+ }
127
+ }
segmenter.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flashtext import KeywordProcessor
2
+ import json
3
+ class ResumeSegmenter():
4
+ class ResumeSegmenter():
5
+ def __init__(self):
6
+ self.resume_segments = {
7
+ 'objective': [],
8
+ 'work_and_employment': [],
9
+ 'education_and_training': [],
10
+ 'skills': [],
11
+ 'accomplishments': [],
12
+ 'misc': []
13
+ }
14
+ self.resume_indices = []
15
+
16
+ def get_average_line_len(self, lines):
17
+ sum = 0
18
+ for line in lines:
19
+ sum+=len(line)
20
+ return sum / len(lines)
21
+
22
+ def get_average_words_per_line(self, lines):
23
+ sum = 0
24
+ for line in lines:
25
+ #other stopwords too?
26
+ sum+= len(line.split(' '))
27
+ return sum/ len(lines)
28
+
29
+ def find_segment_indices(self, text_list):
30
+ with open(r"./sections.json") as f:
31
+ data = json.load(f)
32
+ section_headers = data["section_headers"]
33
+ f.close()
34
+ keyword_processor = KeywordProcessor()
35
+ keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
36
+ average_words_per_line = self.get_average_words_per_line(text_list)
37
+
38
+ for i, line in enumerate(text_list):
39
+ if line[0].islower() or line[-1] == '.':
40
+ continue
41
+ kys = keyword_processor.extract_keywords(line)
42
+ if len(kys) > 0:
43
+ #other stopwords? from where? nltk lib ? pos tagger?
44
+ if len(line.split(" ")) > average_words_per_line * 0.75:
45
+ continue
46
+ #is it necessary to keep the actual raw keyword?
47
+ self.resume_indices.append(i)
48
+ self.resume_segments[kys[0]].append(i)
49
+
50
+ def slice_segments(self, lines):
51
+ sections = {}
52
+ if len(self.resume_indices) == 0:
53
+ return None
54
+
55
+ for section, points in self.resume_segments.items():
56
+ if len(points) == 0: continue
57
+ start_point = points[0]
58
+ tmp_end_point = points[-1]
59
+ end_point = self.resume_indices[min(self.resume_indices.index(tmp_end_point)+1,
60
+ len(self.resume_indices)-1)]
61
+ if start_point == self.resume_indices[-1]:
62
+ end_point = len(lines)
63
+ sections[section] = (start_point, end_point)
64
+ sections["basics_info"] = (0, self.resume_indices[0])
65
+ return sections
66
+
67
+ def get_interval_intersection(self, sections, interval):
68
+ for section in sections:
69
+ s = section[1]
70
+ if s[0] >= interval[1] or interval[0] >= s[1]:
71
+ return None
72
+ else:
73
+ start = max(s[0], interval[0])
74
+ end = min(s[1], interval[1])
75
+ return [start, end], section
76
+ def segment(self, resume_lines):
77
+ self.find_segment_indices(resume_lines)
78
+ sections = self.slice_segments(resume_lines)
79
+ #whats the naming convention here sections_list or list_sections???
80
+ sections_list = [(k, v) for k,v in sections.items() if len(v) > 0 ]
81
+ intersection_intervals = []
82
+
83
+ for i, s in enumerate(sections_list[:-1]):
84
+ result = self.get_interval_intersection(sections_list[i+1:], s[1])
85
+ if result is None:
86
+ continue
87
+ else:
88
+ a,b = result
89
+ print(a,b,s[0])
90
+ intersection_intervals.append((a,b,s[0]))
91
+
92
+ if len(intersection_intervals) > 0:
93
+ print("there are intersections", intersection_intervals)
94
+ #needs last method of cleaning overlapping intervals with zero shot
95
+ #classifier + substract intervals
96
+ return sections
97
+
98
+ def get_parsed_sections(self, resume_lines):
99
+ text_segments = {}
100
+ sections = self.segment(resume_lines)
101
+ for header_title, section in sections.items():
102
+ lines = resume_lines[section[0]:section[1]]
103
+ text_segments[header_title] = lines
104
+
105
+ return text_segments, sections