Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
25510ce
1
Parent(s):
2021cd6
Add deduplication and filtering for knowledge predictions
Browse files
skills_extraction/skills_extraction.py
CHANGED
@@ -1,8 +1,11 @@
|
|
|
|
1 |
import string
|
2 |
import numpy as np
|
3 |
from fastapi import FastAPI
|
4 |
from pydantic import BaseModel
|
5 |
from transformers import pipeline, AutoTokenizer
|
|
|
|
|
6 |
|
7 |
# Initialize FastAPI
|
8 |
app = FastAPI()
|
@@ -76,6 +79,34 @@ def chunk_text(text, tokenizer, max_length=500, overlap=100):
|
|
76 |
return chunks
|
77 |
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
@app.post("/predict_knowledge")
|
80 |
def predict_knowledge(input_data: TextInput):
|
81 |
# Clean non-printable chars
|
@@ -85,7 +116,18 @@ def predict_knowledge(input_data: TextInput):
|
|
85 |
for chunk in chunks:
|
86 |
preds = knowledge_nlp(chunk)
|
87 |
all_preds.extend(convert_from_numpy(preds))
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
|
91 |
@app.post("/predict_skills")
|
|
|
1 |
+
import re
|
2 |
import string
|
3 |
import numpy as np
|
4 |
from fastapi import FastAPI
|
5 |
from pydantic import BaseModel
|
6 |
from transformers import pipeline, AutoTokenizer
|
7 |
+
from semantic_similarity.semantic_similarity import model as embedding_model
|
8 |
+
from sentence_transformers import util
|
9 |
|
10 |
# Initialize FastAPI
|
11 |
app = FastAPI()
|
|
|
79 |
return chunks
|
80 |
|
81 |
|
82 |
+
def deduplicate_by_similarity(items, embeddings, threshold=0.7):
    """Greedily drop near-duplicate items based on embedding cosine similarity.

    Walks ``items`` in order: the first member of each similarity cluster is
    kept, and every later item whose cosine similarity with an already-kept
    item exceeds ``threshold`` is discarded.

    NOTE(review): assumes ``embeddings`` is row-aligned with ``items``
    (one embedding per item, same order) — confirm against callers.
    """
    # Pairwise similarity of every item against every other item.
    similarities = util.cos_sim(embeddings, embeddings)

    duplicates = set()
    survivors = []
    total = len(items)

    for idx, item in enumerate(items):
        if idx in duplicates:
            continue
        survivors.append(item)
        # Mark every later item that is too similar to this kept one.
        for later in range(idx + 1, total):
            if similarities[idx][later] > threshold:
                duplicates.add(later)

    return survivors
|
95 |
+
|
96 |
+
|
97 |
+
def filter_knowledge(results):
    """Normalize predicted knowledge names and drop low-quality entries.

    Each result's ``"name"`` is cleaned in place (punctuation removed,
    whitespace runs collapsed, surrounding whitespace stripped). A new
    list is returned containing only the entries that pass all filters:

      - at most 3 words after cleaning,
      - longer than 2 characters,
      - ``confidence`` >= 0.95.

    NOTE(review): assumes each result is a dict with a str ``"name"`` and
    a numeric ``"confidence"`` key — matches the prediction pipeline output.
    """
    filtered_results = []
    for result in results:
        # Clean the name, then strip AGAIN at the end: removing
        # leading/trailing punctuation can leave a dangling space that
        # the original pre-cleanup strip missed (e.g. "foo -" -> "foo ").
        name = result["name"].strip()
        name = re.sub(r'[^\w\s]', '', name)
        name = re.sub(r'\s+', ' ', name).strip()
        result["name"] = name

        # Reject over-long phrases, trivially short names, and
        # low-confidence predictions.
        if len(name.split()) > 3 or len(name) <= 2 or result['confidence'] < 0.95:
            continue
        filtered_results.append(result)
    return filtered_results
|
108 |
+
|
109 |
+
|
110 |
@app.post("/predict_knowledge")
|
111 |
def predict_knowledge(input_data: TextInput):
|
112 |
# Clean non-printable chars
|
|
|
116 |
for chunk in chunks:
|
117 |
preds = knowledge_nlp(chunk)
|
118 |
all_preds.extend(convert_from_numpy(preds))
|
119 |
+
result = merge_BI_and_get_results(all_preds)
|
120 |
+
if not result:
|
121 |
+
return {"knowledge_predictions": []}
|
122 |
+
|
123 |
+
result = filter_knowledge(result)
|
124 |
+
|
125 |
+
knowledge_names = [r["name"] for r in result]
|
126 |
+
embeddings_tensor = embedding_model.encode(knowledge_names, convert_to_tensor=True)
|
127 |
+
embeddings = embeddings_tensor.cpu().numpy()
|
128 |
+
deduped_results = deduplicate_by_similarity(result, embeddings)
|
129 |
+
|
130 |
+
return {"knowledge_predictions": deduped_results}
|
131 |
|
132 |
|
133 |
@app.post("/predict_skills")
|